diff options
Diffstat (limited to 'media/libjpeg')
178 files changed, 87847 insertions, 0 deletions
diff --git a/media/libjpeg/1050342.diff b/media/libjpeg/1050342.diff new file mode 100644 index 000000000..a409ca2c4 --- /dev/null +++ b/media/libjpeg/1050342.diff @@ -0,0 +1,121 @@ +Bug 1050342. Fix a case where the fast huffman decoder in libjpeg-turbo can produce different results depending on how data is fed to it. + +This change comes from the blink repo https://codereview.appspot.com/229430043/ and is unlikely to be accepted upstream into libjpeg-turbo. + +diff --git jdhuff.c jdhuff.c +--- jdhuff.c ++++ jdhuff.c +@@ -664,17 +664,17 @@ decode_mcu_fast (j_decompress_ptr cinfo, + ASSIGN_STATE(state, entropy->saved); + + for (blkn = 0; blkn < cinfo->blocks_in_MCU; blkn++) { + JBLOCKROW block = MCU_data ? MCU_data[blkn] : NULL; + d_derived_tbl *dctbl = entropy->dc_cur_tbls[blkn]; + d_derived_tbl *actbl = entropy->ac_cur_tbls[blkn]; + register int s, k, r, l; + +- HUFF_DECODE_FAST(s, l, dctbl); ++ HUFF_DECODE_FAST(s, l, dctbl, slow_decode_mcu); + if (s) { + FILL_BIT_BUFFER_FAST + r = GET_BITS(s); + s = HUFF_EXTEND(r, s); + } + + if (entropy->dc_needed[blkn]) { + int ci = cinfo->MCU_membership[blkn]; +@@ -682,17 +682,17 @@ decode_mcu_fast (j_decompress_ptr cinfo, + state.last_dc_val[ci] = s; + if (block) + (*block)[0] = (JCOEF) s; + } + + if (entropy->ac_needed[blkn] && block) { + + for (k = 1; k < DCTSIZE2; k++) { +- HUFF_DECODE_FAST(s, l, actbl); ++ HUFF_DECODE_FAST(s, l, actbl, slow_decode_mcu); + r = s >> 4; + s &= 15; + + if (s) { + k += r; + FILL_BIT_BUFFER_FAST + r = GET_BITS(s); + s = HUFF_EXTEND(r, s); +@@ -701,33 +701,34 @@ decode_mcu_fast (j_decompress_ptr cinfo, + if (r != 15) break; + k += 15; + } + } + + } else { + + for (k = 1; k < DCTSIZE2; k++) { +- HUFF_DECODE_FAST(s, l, actbl); ++ HUFF_DECODE_FAST(s, l, actbl, slow_decode_mcu); + r = s >> 4; + s &= 15; + + if (s) { + k += r; + FILL_BIT_BUFFER_FAST + DROP_BITS(s); + } else { + if (r != 15) break; + k += 15; + } + } + } + } + + if (cinfo->unread_marker != 0) { ++slow_decode_mcu: + cinfo->unread_marker = 0; + return FALSE; + } + + br_state.bytes_in_buffer -= (buffer - br_state.next_input_byte); + br_state.next_input_byte = buffer; + BITREAD_SAVE_STATE(cinfo,entropy->bitstate); + ASSIGN_STATE(entropy->saved, state); +diff --git jdhuff.h jdhuff.h +--- jdhuff.h ++++ jdhuff.h +@@ -203,32 +203,34 @@ EXTERN(boolean) jpeg_fill_bit_buffer + } else { \ + slowlabel: \ + if ((result=jpeg_huff_decode(&state,get_buffer,bits_left,htbl,nb)) < 0) \ + { failaction; } \ + get_buffer = state.get_buffer; bits_left = state.bits_left; \ + } \ + } + +-#define HUFF_DECODE_FAST(s,nb,htbl) \ ++#define HUFF_DECODE_FAST(s,nb,htbl,slowlabel) \ + FILL_BIT_BUFFER_FAST; \ + s = PEEK_BITS(HUFF_LOOKAHEAD); \ + s = htbl->lookup[s]; \ + nb = s >> HUFF_LOOKAHEAD; \ + /* Pre-execute the common case of nb <= HUFF_LOOKAHEAD */ \ + DROP_BITS(nb); \ + s = s & ((1 << HUFF_LOOKAHEAD) - 1); \ + if (nb > HUFF_LOOKAHEAD) { \ + /* Equivalent of jpeg_huff_decode() */ \ + /* Don't use GET_BITS() here because we don't want to modify bits_left */ \ + s = (get_buffer >> bits_left) & ((1 << (nb)) - 1); \ + while (s > htbl->maxcode[nb]) { \ + s <<= 1; \ + s |= GET_BITS(1); \ + nb++; \ + } \ +- s = htbl->pub->huffval[ (int) (s + htbl->valoffset[nb]) & 0xFF ]; \ ++ if (nb > 16) \ ++ goto slowlabel; \ ++ s = htbl->pub->huffval[ (int) (s + htbl->valoffset[nb]) ]; \ + } + + /* Out-of-line case for Huffman code fetching */ + EXTERN(int) jpeg_huff_decode + (bitread_working_state *state, register bit_buf_type get_buffer, + register int bits_left, d_derived_tbl *htbl, int min_bits); diff --git a/media/libjpeg/LICENSE.md b/media/libjpeg/LICENSE.md new file mode 100644 index 000000000..4623e2942 --- /dev/null +++ b/media/libjpeg/LICENSE.md @@ -0,0 +1,88 @@ +libjpeg-turbo Licenses +====================== + +libjpeg-turbo is covered by three compatible BSD-style open source licenses: + +- The IJG (Independent JPEG Group) License, which is listed in + [README.ijg](README.ijg) + + This license applies to the libjpeg API library and associated programs + (any code inherited from libjpeg, and any modifications to that code.) + +- The Modified (3-clause) BSD License, which is listed in + [turbojpeg.c](turbojpeg.c) + + This license covers the TurboJPEG API library and associated programs. + +- The zlib License, which is listed in [simd/jsimdext.inc](simd/jsimdext.inc) + + This license is a subset of the other two, and it covers the libjpeg-turbo + SIMD extensions. + + +Complying with the libjpeg-turbo Licenses +========================================= + +This section provides a roll-up of the libjpeg-turbo licensing terms, to the +best of our understanding. + +1. If you are distributing a modified version of the libjpeg-turbo source, + then: + + 1. You cannot alter or remove any existing copyright or license notices + from the source. + + **Origin** + - Clause 1 of the IJG License + - Clause 1 of the Modified BSD License + - Clauses 1 and 3 of the zlib License + + 2. You must add your own copyright notice to the header of each source + file you modified, so others can tell that you modified that file (if + there is not an existing copyright header in that file, then you can + simply add a notice stating that you modified the file.) + + **Origin** + - Clause 1 of the IJG License + - Clause 2 of the zlib License + + 3. You must include the IJG README file, and you must not alter any of the + copyright or license text in that file. + + **Origin** + - Clause 1 of the IJG License + +2. If you are distributing only libjpeg-turbo binaries without the source, or + if you are distributing an application that statically links with + libjpeg-turbo, then: + + 1. Your product documentation must include a message stating: + + This software is based in part on the work of the Independent JPEG + Group. + + **Origin** + - Clause 2 of the IJG license + + 2. If your binary distribution includes or uses the TurboJPEG API, then + your product documentation must include the text of the Modified BSD + License. + + **Origin** + - Clause 2 of the Modified BSD License + +3. You cannot use the name of the IJG or The libjpeg-turbo Project or the + contributors thereof in advertising, publicity, etc. + + **Origin** + - IJG License + - Clause 3 of the Modified BSD License + +4. The IJG and The libjpeg-turbo Project do not warrant libjpeg-turbo to be + free of defects, nor do we accept any liability for undesirable + consequences resulting from your use of the software. + + **Origin** + - IJG License + - Modified BSD License + - zlib License diff --git a/media/libjpeg/MOZCHANGES b/media/libjpeg/MOZCHANGES new file mode 100644 index 000000000..6e7824cdd --- /dev/null +++ b/media/libjpeg/MOZCHANGES @@ -0,0 +1,127 @@ +To upgrade to a new revision of libjpeg-turbo, do the following: + +* Check out libjpeg-turbo from git: + + $ git clone https://github.com/libjpeg-turbo/libjpeg-turbo.git + +* In a clean clone of mozilla-central, run the update script (tag defaults to HEAD): + + $ ./media/update-libjpeg.sh /path/to/libjpeg-turbo [tag] + + and fix up any rejects from applying the Mozilla specific patches at the end + of that script. + +* Since libjpeg-turbo normally creates jconfig.h and jconfigint.h at build time + and we use pre-generated versions, changes to jconfig.h.in and jconfigint.h.in + should be looked for and noted for later inclusion. + +* Now look through the new files and rm any which are npotb. When I upgraded + to libjpeg-turbo 1.1.0, the only files I kept which didn't match + + *.c *.h *.asm *.inc + + were README and README-turbo. + + You can easily look for all non *.c, *.h, *.asm, and *.inc files by running + + $ hg status -nu | grep -v '\(c\|h\|asm\|inc\|md\|ijg\)$' + + Once you're comfortable that you're only deleting files you want to delete + (and you've hg add'ed the files you want to keep), you can nuke the remaining + files with + + $ hg status -nu | grep -v '\(c\|h\|asm\|inc\|md\|ijg\)$' | xargs rm + + A helpful command for finding the *.c files which aren't *currently* part of + the build is + + diff <(ls *.c | sort) <(grep -o '\w*\.c' Makefile.in | sort) + + Of course, libjpeg-turbo might have added some new source files, so you'll + have to look though and figure out which of these files to keep. + +* Update jconfig.h and jconfigint.h as noted previously. + +* Update moz.build to build any new files. + +* Finally, tell hg that we've added or removed some files: + + $ hg addremove + +== September 22, 2016 (libjpeg-turbo v1.5.1 cb88e5da8003afcdc443b787fdcb77285e5a8a02 2016-09-20) == + +* Updated to v1.5.1 release. + +== June 23, 2016 (libjpeg-turbo v1.5.0 3ff13e651bbe6de9c6f15d05235d1d4f26f63ffc 2016-05-31) == + +* Updated to v1.5.0 release. + +== October 5, 2015 (libjpeg-turbo v1.4.2 d8da49effe6460d55239c4c009c57f42d8e4a494 2015-09-21) == + +* Updated to v1.4.2 release. + +== January 15, 2015 (libjpeg-turbo v1.4.0 r1481 2015-01-07) == + +* Updated to v1.4.0 release. + +== March 24, 2014 (libjpeg-turbo v1.3.1 r1205 2014-03-22) == + +* Updated to v1.3.1 release. + +== November 25, 2013 == + +* Fix bug 891693. + +== June 4, 2013 (libjpeg-turbo v1.3.0 r988 2013-05-25) == + +* Updated to v1.3.0 release. + +== December 12, 2012 == + +* Replace the runtime computed jpeg_nbits_table with constants in + jpeg_nbits_table.h to make it shareable among processes. (bug 815473) + +== October 13, 2012 == + +* Modified config.h to use MOZ_ALWAYS_INLINE (bug 800106). + +== July 4, 2012 (libjpeg-turbo v1.2.1 r853 2012-06-30) == + +* Updated to v1.2.1 stable release. + +== June 5, 2012 (libjpeg-turbo v1.2.x branch, r831 2012-05-30) == + +* Updated to latest version on v1.2.x branch (bug 759891). + +== February 10, 2012 (libjpeg-turbo v1.2.0 r807 2012-02-10) == + +* Imported jchuff.c, jdhuff.c, jdhuff.h under new licensing. + +* Created mozilla.diff for the required jmorecfg.h changes and to allow for any + future changes made by Mozilla to upstream files. + +* Removed the following files which are unused by the Mozilla build: + + cderror.h, cdjpeg.h, jconfig.h.in, transupp.h, simd/jsimdcfg.inc.h + + +== March 28, 2011 (initial commit, libjpeg-turbo v1.1.0 r469 2011-02-27) == + +* Modified jmorecfg.h to define UINT8, UINT16, INT16, and INT32 in terms of + prtypes to fix a build error on Windows. + +* Defined INLINE as NS_ALWAYS_INLINE in jconfig.h. + +* Removed the following files which are licensed under the wxWindows license: + + bmp.c, bmp.h, jpegut.c, jpgtest.cxx, rrtimer.h, rrutil.h, turbojpeg.h, + turbojpegl.c + +* Reverted the following files to what was previously in Mozilla's tree + (nominally libjpeg 6.2): + + jchuff.c, jdhuff.c, jdhuff.h + + since the versions of these files in libjpeg-turbo are also under the + wxWindows license. (It would have been nicer to revert them to the new + libjpeg-8b code, but that doesn't easily integrate with libjpeg-turbo.) diff --git a/media/libjpeg/README.ijg b/media/libjpeg/README.ijg new file mode 100644 index 000000000..9c450ceb0 --- /dev/null +++ b/media/libjpeg/README.ijg @@ -0,0 +1,279 @@ +libjpeg-turbo note: This file has been modified by The libjpeg-turbo Project +to include only information relevant to libjpeg-turbo, to wordsmith certain +sections, and to remove impolitic language that existed in the libjpeg v8 +README. It is included only for reference. Please see README.md for +information specific to libjpeg-turbo. + + +The Independent JPEG Group's JPEG software +========================================== + +This distribution contains a release of the Independent JPEG Group's free JPEG +software. You are welcome to redistribute this software and to use it for any +purpose, subject to the conditions under LEGAL ISSUES, below. + +This software is the work of Tom Lane, Guido Vollbeding, Philip Gladstone, +Bill Allombert, Jim Boucher, Lee Crocker, Bob Friesenhahn, Ben Jackson, +Julian Minguillon, Luis Ortiz, George Phillips, Davide Rossi, Ge' Weijers, +and other members of the Independent JPEG Group. + +IJG is not affiliated with the ISO/IEC JTC1/SC29/WG1 standards committee +(also known as JPEG, together with ITU-T SG16). + + +DOCUMENTATION ROADMAP +===================== + +This file contains the following sections: + +OVERVIEW General description of JPEG and the IJG software. +LEGAL ISSUES Copyright, lack of warranty, terms of distribution. +REFERENCES Where to learn more about JPEG. +ARCHIVE LOCATIONS Where to find newer versions of this software. +FILE FORMAT WARS Software *not* to get. +TO DO Plans for future IJG releases. + +Other documentation files in the distribution are: + +User documentation: + usage.txt Usage instructions for cjpeg, djpeg, jpegtran, + rdjpgcom, and wrjpgcom. + *.1 Unix-style man pages for programs (same info as usage.txt). + wizard.txt Advanced usage instructions for JPEG wizards only. + change.log Version-to-version change highlights. +Programmer and internal documentation: + libjpeg.txt How to use the JPEG library in your own programs. + example.c Sample code for calling the JPEG library. + structure.txt Overview of the JPEG library's internal structure. + coderules.txt Coding style rules --- please read if you contribute code. + +Please read at least usage.txt. Some information can also be found in the JPEG +FAQ (Frequently Asked Questions) article. See ARCHIVE LOCATIONS below to find +out where to obtain the FAQ article. + +If you want to understand how the JPEG code works, we suggest reading one or +more of the REFERENCES, then looking at the documentation files (in roughly +the order listed) before diving into the code. + + +OVERVIEW +======== + +This package contains C software to implement JPEG image encoding, decoding, +and transcoding. JPEG (pronounced "jay-peg") is a standardized compression +method for full-color and grayscale images. JPEG's strong suit is compressing +photographic images or other types of images that have smooth color and +brightness transitions between neighboring pixels. Images with sharp lines or +other abrupt features may not compress well with JPEG, and a higher JPEG +quality may have to be used to avoid visible compression artifacts with such +images. + +JPEG is lossy, meaning that the output pixels are not necessarily identical to +the input pixels. However, on photographic content and other "smooth" images, +very good compression ratios can be obtained with no visible compression +artifacts, and extremely high compression ratios are possible if you are +willing to sacrifice image quality (by reducing the "quality" setting in the +compressor.) + +This software implements JPEG baseline, extended-sequential, and progressive +compression processes. Provision is made for supporting all variants of these +processes, although some uncommon parameter settings aren't implemented yet. +We have made no provision for supporting the hierarchical or lossless +processes defined in the standard. + +We provide a set of library routines for reading and writing JPEG image files, +plus two sample applications "cjpeg" and "djpeg", which use the library to +perform conversion between JPEG and some other popular image file formats. +The library is intended to be reused in other applications. + +In order to support file conversion and viewing software, we have included +considerable functionality beyond the bare JPEG coding/decoding capability; +for example, the color quantization modules are not strictly part of JPEG +decoding, but they are essential for output to colormapped file formats or +colormapped displays. These extra functions can be compiled out of the +library if not required for a particular application. + +We have also included "jpegtran", a utility for lossless transcoding between +different JPEG processes, and "rdjpgcom" and "wrjpgcom", two simple +applications for inserting and extracting textual comments in JFIF files. + +The emphasis in designing this software has been on achieving portability and +flexibility, while also making it fast enough to be useful. In particular, +the software is not intended to be read as a tutorial on JPEG. (See the +REFERENCES section for introductory material.) Rather, it is intended to +be reliable, portable, industrial-strength code. We do not claim to have +achieved that goal in every aspect of the software, but we strive for it. + +We welcome the use of this software as a component of commercial products. +No royalty is required, but we do ask for an acknowledgement in product +documentation, as described under LEGAL ISSUES. + + +LEGAL ISSUES +============ + +In plain English: + +1. We don't promise that this software works. (But if you find any bugs, + please let us know!) +2. You can use this software for whatever you want. You don't have to pay us. +3. You may not pretend that you wrote this software. If you use it in a + program, you must acknowledge somewhere in your documentation that + you've used the IJG code. + +In legalese: + +The authors make NO WARRANTY or representation, either express or implied, +with respect to this software, its quality, accuracy, merchantability, or +fitness for a particular purpose. This software is provided "AS IS", and you, +its user, assume the entire risk as to its quality and accuracy. + +This software is copyright (C) 1991-2016, Thomas G. Lane, Guido Vollbeding. +All Rights Reserved except as specified below. + +Permission is hereby granted to use, copy, modify, and distribute this +software (or portions thereof) for any purpose, without fee, subject to these +conditions: +(1) If any part of the source code for this software is distributed, then this +README file must be included, with this copyright and no-warranty notice +unaltered; and any additions, deletions, or changes to the original files +must be clearly indicated in accompanying documentation. +(2) If only executable code is distributed, then the accompanying +documentation must state that "this software is based in part on the work of +the Independent JPEG Group". +(3) Permission for use of this software is granted only if the user accepts +full responsibility for any undesirable consequences; the authors accept +NO LIABILITY for damages of any kind. + +These conditions apply to any software derived from or based on the IJG code, +not just to the unmodified library. If you use our work, you ought to +acknowledge us. + +Permission is NOT granted for the use of any IJG author's name or company name +in advertising or publicity relating to this software or products derived from +it. This software may be referred to only as "the Independent JPEG Group's +software". + +We specifically permit and encourage the use of this software as the basis of +commercial products, provided that all warranty or liability claims are +assumed by the product vendor. + + +The Unix configuration script "configure" was produced with GNU Autoconf. +It is copyright by the Free Software Foundation but is freely distributable. +The same holds for its supporting scripts (config.guess, config.sub, +ltmain.sh). Another support script, install-sh, is copyright by X Consortium +but is also freely distributable. + +The IJG distribution formerly included code to read and write GIF files. +To avoid entanglement with the Unisys LZW patent (now expired), GIF reading +support has been removed altogether, and the GIF writer has been simplified +to produce "uncompressed GIFs". This technique does not use the LZW +algorithm; the resulting GIF files are larger than usual, but are readable +by all standard GIF decoders. + +We are required to state that + "The Graphics Interchange Format(c) is the Copyright property of + CompuServe Incorporated. GIF(sm) is a Service Mark property of + CompuServe Incorporated." + + +REFERENCES +========== + +We recommend reading one or more of these references before trying to +understand the innards of the JPEG software. + +The best short technical introduction to the JPEG compression algorithm is + Wallace, Gregory K. "The JPEG Still Picture Compression Standard", + Communications of the ACM, April 1991 (vol. 34 no. 4), pp. 30-44. +(Adjacent articles in that issue discuss MPEG motion picture compression, +applications of JPEG, and related topics.) If you don't have the CACM issue +handy, a PDF file containing a revised version of Wallace's article is +available at http://www.ijg.org/files/Wallace.JPEG.pdf. The file (actually +a preprint for an article that appeared in IEEE Trans. Consumer Electronics) +omits the sample images that appeared in CACM, but it includes corrections +and some added material. Note: the Wallace article is copyright ACM and IEEE, +and it may not be used for commercial purposes. + +A somewhat less technical, more leisurely introduction to JPEG can be found in +"The Data Compression Book" by Mark Nelson and Jean-loup Gailly, published by +M&T Books (New York), 2nd ed. 1996, ISBN 1-55851-434-1. This book provides +good explanations and example C code for a multitude of compression methods +including JPEG. It is an excellent source if you are comfortable reading C +code but don't know much about data compression in general. The book's JPEG +sample code is far from industrial-strength, but when you are ready to look +at a full implementation, you've got one here... + +The best currently available description of JPEG is the textbook "JPEG Still +Image Data Compression Standard" by William B. Pennebaker and Joan L. +Mitchell, published by Van Nostrand Reinhold, 1993, ISBN 0-442-01272-1. +Price US$59.95, 638 pp. The book includes the complete text of the ISO JPEG +standards (DIS 10918-1 and draft DIS 10918-2). + +The original JPEG standard is divided into two parts, Part 1 being the actual +specification, while Part 2 covers compliance testing methods. Part 1 is +titled "Digital Compression and Coding of Continuous-tone Still Images, +Part 1: Requirements and guidelines" and has document numbers ISO/IEC IS +10918-1, ITU-T T.81. Part 2 is titled "Digital Compression and Coding of +Continuous-tone Still Images, Part 2: Compliance testing" and has document +numbers ISO/IEC IS 10918-2, ITU-T T.83. + +The JPEG standard does not specify all details of an interchangeable file +format. For the omitted details we follow the "JFIF" conventions, revision +1.02. JFIF 1.02 has been adopted as an Ecma International Technical Report +and thus received a formal publication status. It is available as a free +download in PDF format from +http://www.ecma-international.org/publications/techreports/E-TR-098.htm. +A PostScript version of the JFIF document is available at +http://www.ijg.org/files/jfif.ps.gz. There is also a plain text version at +http://www.ijg.org/files/jfif.txt.gz, but it is missing the figures. + +The TIFF 6.0 file format specification can be obtained by FTP from +ftp://ftp.sgi.com/graphics/tiff/TIFF6.ps.gz. The JPEG incorporation scheme +found in the TIFF 6.0 spec of 3-June-92 has a number of serious problems. +IJG does not recommend use of the TIFF 6.0 design (TIFF Compression tag 6). +Instead, we recommend the JPEG design proposed by TIFF Technical Note #2 +(Compression tag 7). Copies of this Note can be obtained from +http://www.ijg.org/files/. It is expected that the next revision +of the TIFF spec will replace the 6.0 JPEG design with the Note's design. +Although IJG's own code does not support TIFF/JPEG, the free libtiff library +uses our library to implement TIFF/JPEG per the Note. + + +ARCHIVE LOCATIONS +================= + +The "official" archive site for this software is www.ijg.org. +The most recent released version can always be found there in +directory "files". + +The JPEG FAQ (Frequently Asked Questions) article is a source of some +general information about JPEG. +It is available on the World Wide Web at http://www.faqs.org/faqs/jpeg-faq/ +and other news.answers archive sites, including the official news.answers +archive at rtfm.mit.edu: ftp://rtfm.mit.edu/pub/usenet/news.answers/jpeg-faq/. +If you don't have Web or FTP access, send e-mail to mail-server@rtfm.mit.edu +with body + send usenet/news.answers/jpeg-faq/part1 + send usenet/news.answers/jpeg-faq/part2 + + +FILE FORMAT WARS +================ + +The ISO/IEC JTC1/SC29/WG1 standards committee (also known as JPEG, together +with ITU-T SG16) currently promotes different formats containing the name +"JPEG" which are incompatible with original DCT-based JPEG. IJG therefore does +not support these formats (see REFERENCES). Indeed, one of the original +reasons for developing this free software was to help force convergence on +common, interoperable format standards for JPEG files. +Don't use an incompatible file format! +(In any case, our decoder will remain capable of reading existing JPEG +image files indefinitely.) + + +TO DO +===== + +Please send bug reports, offers of help, etc. to jpeg-info@jpegclub.org. diff --git a/media/libjpeg/README.md b/media/libjpeg/README.md new file mode 100755 index 000000000..ca8866e06 --- /dev/null +++ b/media/libjpeg/README.md @@ -0,0 +1,341 @@ +Background +========== + +libjpeg-turbo is a JPEG image codec that uses SIMD instructions (MMX, SSE2, +NEON, AltiVec) to accelerate baseline JPEG compression and decompression on +x86, x86-64, ARM, and PowerPC systems. On such systems, libjpeg-turbo is +generally 2-6x as fast as libjpeg, all else being equal. On other types of +systems, libjpeg-turbo can still outperform libjpeg by a significant amount, by +virtue of its highly-optimized Huffman coding routines. In many cases, the +performance of libjpeg-turbo rivals that of proprietary high-speed JPEG codecs. + +libjpeg-turbo implements both the traditional libjpeg API as well as the less +powerful but more straightforward TurboJPEG API. libjpeg-turbo also features +colorspace extensions that allow it to compress from/decompress to 32-bit and +big-endian pixel buffers (RGBX, XBGR, etc.), as well as a full-featured Java +interface. + +libjpeg-turbo was originally based on libjpeg/SIMD, an MMX-accelerated +derivative of libjpeg v6b developed by Miyasaka Masaru. The TigerVNC and +VirtualGL projects made numerous enhancements to the codec in 2009, and in +early 2010, libjpeg-turbo spun off into an independent project, with the goal +of making high-speed JPEG compression/decompression technology available to a +broader range of users and developers. + + +License +======= + +libjpeg-turbo is covered by three compatible BSD-style open source licenses. +Refer to [LICENSE.md](LICENSE.md) for a roll-up of license terms. + + +Building libjpeg-turbo +====================== + +Refer to [BUILDING.md](BUILDING.md) for complete instructions. + + +Using libjpeg-turbo +=================== + +libjpeg-turbo includes two APIs that can be used to compress and decompress +JPEG images: + +- **TurboJPEG API** + This API provides an easy-to-use interface for compressing and decompressing + JPEG images in memory. It also provides some functionality that would not be + straightforward to achieve using the underlying libjpeg API, such as + generating planar YUV images and performing multiple simultaneous lossless + transforms on an image. The Java interface for libjpeg-turbo is written on + top of the TurboJPEG API. + +- **libjpeg API** + This is the de facto industry-standard API for compressing and decompressing + JPEG images. It is more difficult to use than the TurboJPEG API but also + more powerful. The libjpeg API implementation in libjpeg-turbo is both + API/ABI-compatible and mathematically compatible with libjpeg v6b. It can + also optionally be configured to be API/ABI-compatible with libjpeg v7 and v8 + (see below.) + +There is no significant performance advantage to either API when both are used +to perform similar operations. + +Colorspace Extensions +--------------------- + +libjpeg-turbo includes extensions that allow JPEG images to be compressed +directly from (and decompressed directly to) buffers that use BGR, BGRX, +RGBX, XBGR, and XRGB pixel ordering. This is implemented with ten new +colorspace constants: + + JCS_EXT_RGB /* red/green/blue */ + JCS_EXT_RGBX /* red/green/blue/x */ + JCS_EXT_BGR /* blue/green/red */ + JCS_EXT_BGRX /* blue/green/red/x */ + JCS_EXT_XBGR /* x/blue/green/red */ + JCS_EXT_XRGB /* x/red/green/blue */ + JCS_EXT_RGBA /* red/green/blue/alpha */ + JCS_EXT_BGRA /* blue/green/red/alpha */ + JCS_EXT_ABGR /* alpha/blue/green/red */ + JCS_EXT_ARGB /* alpha/red/green/blue */ + +Setting `cinfo.in_color_space` (compression) or `cinfo.out_color_space` +(decompression) to one of these values will cause libjpeg-turbo to read the +red, green, and blue values from (or write them to) the appropriate position in +the pixel when compressing from/decompressing to an RGB buffer. + +Your application can check for the existence of these extensions at compile +time with: + + #ifdef JCS_EXTENSIONS + +At run time, attempting to use these extensions with a libjpeg implementation +that does not support them will result in a "Bogus input colorspace" error. +Applications can trap this error in order to test whether run-time support is +available for the colorspace extensions. + +When using the RGBX, BGRX, XBGR, and XRGB colorspaces during decompression, the +X byte is undefined, and in order to ensure the best performance, libjpeg-turbo +can set that byte to whatever value it wishes. If an application expects the X +byte to be used as an alpha channel, then it should specify `JCS_EXT_RGBA`, +`JCS_EXT_BGRA`, `JCS_EXT_ABGR`, or `JCS_EXT_ARGB`. When these colorspace +constants are used, the X byte is guaranteed to be 0xFF, which is interpreted +as opaque. + +Your application can check for the existence of the alpha channel colorspace +extensions at compile time with: + + #ifdef JCS_ALPHA_EXTENSIONS + +[jcstest.c](jcstest.c), located in the libjpeg-turbo source tree, demonstrates +how to check for the existence of the colorspace extensions at compile time and +run time. + +libjpeg v7 and v8 API/ABI Emulation +----------------------------------- + +With libjpeg v7 and v8, new features were added that necessitated extending the +compression and decompression structures. Unfortunately, due to the exposed +nature of those structures, extending them also necessitated breaking backward +ABI compatibility with previous libjpeg releases. Thus, programs that were +built to use libjpeg v7 or v8 did not work with libjpeg-turbo, since it is +based on the libjpeg v6b code base. Although libjpeg v7 and v8 are not +as widely used as v6b, enough programs (including a few Linux distros) made +the switch that there was a demand to emulate the libjpeg v7 and v8 ABIs +in libjpeg-turbo. It should be noted, however, that this feature was added +primarily so that applications that had already been compiled to use libjpeg +v7+ could take advantage of accelerated baseline JPEG encoding/decoding +without recompiling. libjpeg-turbo does not claim to support all of the +libjpeg v7+ features, nor to produce identical output to libjpeg v7+ in all +cases (see below.) + +By passing an argument of `--with-jpeg7` or `--with-jpeg8` to `configure`, or +an argument of `-DWITH_JPEG7=1` or `-DWITH_JPEG8=1` to `cmake`, you can build a +version of libjpeg-turbo that emulates the libjpeg v7 or v8 ABI, so that +programs that are built against libjpeg v7 or v8 can be run with libjpeg-turbo. +The following section describes which libjpeg v7+ features are supported and +which aren't. + +### Support for libjpeg v7 and v8 Features + +#### Fully supported + +- **libjpeg: IDCT scaling extensions in decompressor** + libjpeg-turbo supports IDCT scaling with scaling factors of 1/8, 1/4, 3/8, + 1/2, 5/8, 3/4, 7/8, 9/8, 5/4, 11/8, 3/2, 13/8, 7/4, 15/8, and 2/1 (only 1/4 + and 1/2 are SIMD-accelerated.) + +- **libjpeg: Arithmetic coding** + +- **libjpeg: In-memory source and destination managers** + See notes below. + +- **cjpeg: Separate quality settings for luminance and chrominance** + Note that the libpjeg v7+ API was extended to accommodate this feature only + for convenience purposes. It has always been possible to implement this + feature with libjpeg v6b (see rdswitch.c for an example.) + +- **cjpeg: 32-bit BMP support** + +- **cjpeg: `-rgb` option** + +- **jpegtran: Lossless cropping** + +- **jpegtran: `-perfect` option** + +- **jpegtran: Forcing width/height when performing lossless crop** + +- **rdjpgcom: `-raw` option** + +- **rdjpgcom: Locale awareness** + + +#### Not supported + +NOTE: As of this writing, extensive research has been conducted into the +usefulness of DCT scaling as a means of data reduction and SmartScale as a +means of quality improvement. The reader is invited to peruse the research at +<http://www.libjpeg-turbo.org/About/SmartScale> and draw his/her own conclusions, +but it is the general belief of our project that these features have not +demonstrated sufficient usefulness to justify inclusion in libjpeg-turbo. + +- **libjpeg: DCT scaling in compressor** + `cinfo.scale_num` and `cinfo.scale_denom` are silently ignored. + There is no technical reason why DCT scaling could not be supported when + emulating the libjpeg v7+ API/ABI, but without the SmartScale extension (see + below), only scaling factors of 1/2, 8/15, 4/7, 8/13, 2/3, 8/11, 4/5, and + 8/9 would be available, which is of limited usefulness. + +- **libjpeg: SmartScale** + `cinfo.block_size` is silently ignored. + SmartScale is an extension to the JPEG format that allows for DCT block + sizes other than 8x8. Providing support for this new format would be + feasible (particularly without full acceleration.) However, until/unless + the format becomes either an official industry standard or, at minimum, an + accepted solution in the community, we are hesitant to implement it, as + there is no sense of whether or how it might change in the future. It is + our belief that SmartScale has not demonstrated sufficient usefulness as a + lossless format nor as a means of quality enhancement, and thus our primary + interest in providing this feature would be as a means of supporting + additional DCT scaling factors. + +- **libjpeg: Fancy downsampling in compressor** + `cinfo.do_fancy_downsampling` is silently ignored. + This requires the DCT scaling feature, which is not supported. + +- **jpegtran: Scaling** + This requires both the DCT scaling and SmartScale features, which are not + supported. + +- **Lossless RGB JPEG files** + This requires the SmartScale feature, which is not supported. + +### What About libjpeg v9? + +libjpeg v9 introduced yet another field to the JPEG compression structure +(`color_transform`), thus making the ABI backward incompatible with that of +libjpeg v8. This new field was introduced solely for the purpose of supporting +lossless SmartScale encoding. Furthermore, there was actually no reason to +extend the API in this manner, as the color transform could have just as easily +been activated by way of a new JPEG colorspace constant, thus preserving +backward ABI compatibility. + +Our research (see link above) has shown that lossless SmartScale does not +generally accomplish anything that can't already be accomplished better with +existing, standard lossless formats. Therefore, at this time it is our belief +that there is not sufficient technical justification for software projects to +upgrade from libjpeg v8 to libjpeg v9, and thus there is not sufficient +echnical justification for us to emulate the libjpeg v9 ABI. + +In-Memory Source/Destination Managers +------------------------------------- + +By default, libjpeg-turbo 1.3 and later includes the `jpeg_mem_src()` and +`jpeg_mem_dest()` functions, even when not emulating the libjpeg v8 API/ABI. +Previously, it was necessary to build libjpeg-turbo from source with libjpeg v8 +API/ABI emulation in order to use the in-memory source/destination managers, +but several projects requested that those functions be included when emulating +the libjpeg v6b API/ABI as well. This allows the use of those functions by +programs that need them, without breaking ABI compatibility for programs that +don't, and it allows those functions to be provided in the "official" +libjpeg-turbo binaries. + +Those who are concerned about maintaining strict conformance with the libjpeg +v6b or v7 API can pass an argument of `--without-mem-srcdst` to `configure` or +an argument of `-DWITH_MEM_SRCDST=0` to `cmake` prior to building +libjpeg-turbo. This will restore the pre-1.3 behavior, in which +`jpeg_mem_src()` and `jpeg_mem_dest()` are only included when emulating the +libjpeg v8 API/ABI. + +On Un*x systems, including the in-memory source/destination managers changes +the dynamic library version from 62.0.0 to 62.1.0 if using libjpeg v6b API/ABI +emulation and from 7.0.0 to 7.1.0 if using libjpeg v7 API/ABI emulation. + +Note that, on most Un*x systems, the dynamic linker will not look for a +function in a library until that function is actually used. Thus, if a program +is built against libjpeg-turbo 1.3+ and uses `jpeg_mem_src()` or +`jpeg_mem_dest()`, that program will not fail if run against an older version +of libjpeg-turbo or against libjpeg v7- until the program actually tries to +call `jpeg_mem_src()` or `jpeg_mem_dest()`. Such is not the case on Windows. +If a program is built against the libjpeg-turbo 1.3+ DLL and uses +`jpeg_mem_src()` or `jpeg_mem_dest()`, then it must use the libjpeg-turbo 1.3+ +DLL at run time. + +Both cjpeg and djpeg have been extended to allow testing the in-memory +source/destination manager functions. See their respective man pages for more +details. + + +Mathematical Compatibility +========================== + +For the most part, libjpeg-turbo should produce identical output to libjpeg +v6b. The one exception to this is when using the floating point DCT/IDCT, in +which case the outputs of libjpeg v6b and libjpeg-turbo can differ for the +following reasons: + +- The SSE/SSE2 floating point DCT implementation in libjpeg-turbo is ever so + slightly more accurate than the implementation in libjpeg v6b, but not by + any amount perceptible to human vision (generally in the range of 0.01 to + 0.08 dB gain in PNSR.) + +- When not using the SIMD extensions, libjpeg-turbo uses the more accurate + (and slightly faster) floating point IDCT algorithm introduced in libjpeg + v8a as opposed to the algorithm used in libjpeg v6b. It should be noted, + however, that this algorithm basically brings the accuracy of the floating + point IDCT in line with the accuracy of the slow integer IDCT. The floating + point DCT/IDCT algorithms are mainly a legacy feature, and they do not + produce significantly more accuracy than the slow integer algorithms (to put + numbers on this, the typical difference in PNSR between the two algorithms + is less than 0.10 dB, whereas changing the quality level by 1 in the upper + range of the quality scale is typically more like a 1.0 dB difference.) + +- If the floating point algorithms in libjpeg-turbo are not implemented using + SIMD instructions on a particular platform, then the accuracy of the + floating point DCT/IDCT can depend on the compiler settings. + +While libjpeg-turbo does emulate the libjpeg v8 API/ABI, under the hood it is +still using the same algorithms as libjpeg v6b, so there are several specific +cases in which libjpeg-turbo cannot be expected to produce the same output as +libjpeg v8: + +- When decompressing using scaling factors of 1/2 and 1/4, because libjpeg v8 + implements those scaling algorithms differently than libjpeg v6b does, and + libjpeg-turbo's SIMD extensions are based on the libjpeg v6b behavior. + +- When using chrominance subsampling, because libjpeg v8 implements this + with its DCT/IDCT scaling algorithms rather than with a separate + downsampling/upsampling algorithm. In our testing, the subsampled/upsampled + output of libjpeg v8 is less accurate than that of libjpeg v6b for this + reason. + +- When decompressing using a scaling factor > 1 and merged (AKA "non-fancy" or + "non-smooth") chrominance upsampling, because libjpeg v8 does not support + merged upsampling with scaling factors > 1. + + +Performance Pitfalls +==================== + +Restart Markers +--------------- + +The optimized Huffman decoder in libjpeg-turbo does not handle restart markers +in a way that makes the rest of the libjpeg infrastructure happy, so it is +necessary to use the slow Huffman decoder when decompressing a JPEG image that +has restart markers. This can cause the decompression performance to drop by +as much as 20%, but the performance will still be much greater than that of +libjpeg. Many consumer packages, such as PhotoShop, use restart markers when +generating JPEG images, so images generated by those programs will experience +this issue. + +Fast Integer Forward DCT at High Quality Levels +----------------------------------------------- + +The algorithm used by the SIMD-accelerated quantization function cannot produce +correct results whenever the fast integer forward DCT is used along with a JPEG +quality of 98-100. Thus, libjpeg-turbo must use the non-SIMD quantization +function in those cases. This causes performance to drop by as much as 40%. +It is therefore strongly advised that you use the slow integer forward DCT +whenever encoding images with a JPEG quality of 98 or higher. diff --git a/media/libjpeg/jaricom.c b/media/libjpeg/jaricom.c new file mode 100644 index 000000000..3bb557f7a --- /dev/null +++ b/media/libjpeg/jaricom.c @@ -0,0 +1,156 @@ +/* + * jaricom.c + * + * This file was part of the Independent JPEG Group's software: + * Developed 1997-2009 by Guido Vollbeding. + * libjpeg-turbo Modifications: + * Copyright (C) 2015, D. R. Commander. + * For conditions of distribution and use, see the accompanying README.ijg + * file. + * + * This file contains probability estimation tables for common use in + * arithmetic entropy encoding and decoding routines. + * + * This data represents Table D.2 in the JPEG spec (ISO/IEC IS 10918-1 + * and CCITT Recommendation ITU-T T.81) and Table 24 in the JBIG spec + * (ISO/IEC IS 11544 and CCITT Recommendation ITU-T T.82). + */ + +#define JPEG_INTERNALS +#include "jinclude.h" +#include "jpeglib.h" + +/* The following #define specifies the packing of the four components + * into the compact JLONG representation. + * Note that this formula must match the actual arithmetic encoder + * and decoder implementation. The implementation has to be changed + * if this formula is changed. + * The current organization is leaned on Markus Kuhn's JBIG + * implementation (jbig_tab.c). + */ + +#define V(i,a,b,c,d) (((JLONG)a << 16) | ((JLONG)c << 8) | ((JLONG)d << 7) | b) + +const JLONG jpeg_aritab[113+1] = { +/* + * Index, Qe_Value, Next_Index_LPS, Next_Index_MPS, Switch_MPS + */ + V( 0, 0x5a1d, 1, 1, 1 ), + V( 1, 0x2586, 14, 2, 0 ), + V( 2, 0x1114, 16, 3, 0 ), + V( 3, 0x080b, 18, 4, 0 ), + V( 4, 0x03d8, 20, 5, 0 ), + V( 5, 0x01da, 23, 6, 0 ), + V( 6, 0x00e5, 25, 7, 0 ), + V( 7, 0x006f, 28, 8, 0 ), + V( 8, 0x0036, 30, 9, 0 ), + V( 9, 0x001a, 33, 10, 0 ), + V( 10, 0x000d, 35, 11, 0 ), + V( 11, 0x0006, 9, 12, 0 ), + V( 12, 0x0003, 10, 13, 0 ), + V( 13, 0x0001, 12, 13, 0 ), + V( 14, 0x5a7f, 15, 15, 1 ), + V( 15, 0x3f25, 36, 16, 0 ), + V( 16, 0x2cf2, 38, 17, 0 ), + V( 17, 0x207c, 39, 18, 0 ), + V( 18, 0x17b9, 40, 19, 0 ), + V( 19, 0x1182, 42, 20, 0 ), + V( 20, 0x0cef, 43, 21, 0 ), + V( 21, 0x09a1, 45, 22, 0 ), + V( 22, 0x072f, 46, 23, 0 ), + V( 23, 0x055c, 48, 24, 0 ), + V( 24, 0x0406, 49, 25, 0 ), + V( 25, 0x0303, 51, 26, 0 ), + V( 26, 0x0240, 52, 27, 0 ), + V( 27, 0x01b1, 54, 28, 0 ), + V( 28, 0x0144, 56, 29, 0 ), + V( 29, 0x00f5, 57, 30, 0 ), + V( 30, 0x00b7, 59, 31, 0 ), + V( 31, 0x008a, 60, 32, 0 ), + V( 32, 0x0068, 62, 33, 0 ), + V( 33, 0x004e, 63, 34, 0 ), + V( 34, 0x003b, 32, 35, 0 ), + V( 35, 0x002c, 33, 9, 0 ), + V( 36, 0x5ae1, 37, 37, 1 ), + V( 37, 0x484c, 64, 38, 0 ), + V( 38, 0x3a0d, 65, 39, 0 ), + V( 39, 0x2ef1, 67, 40, 0 ), + V( 40, 0x261f, 68, 41, 0 ), + V( 41, 0x1f33, 69, 42, 0 ), + V( 42, 0x19a8, 70, 43, 0 ), + V( 43, 0x1518, 72, 44, 0 ), + V( 44, 0x1177, 73, 45, 0 ), + V( 45, 0x0e74, 74, 46, 0 ), + V( 46, 0x0bfb, 75, 47, 0 ), + V( 47, 0x09f8, 77, 48, 0 ), + V( 48, 0x0861, 78, 49, 0 ), + V( 49, 0x0706, 79, 50, 0 ), + V( 50, 0x05cd, 48, 51, 0 ), + V( 51, 0x04de, 50, 52, 0 ), + V( 52, 0x040f, 50, 53, 0 ), + V( 53, 0x0363, 51, 54, 0 ), + V( 54, 0x02d4, 52, 55, 0 ), + V( 55, 0x025c, 53, 56, 0 ), + V( 56, 0x01f8, 54, 57, 0 ), + V( 57, 0x01a4, 55, 58, 0 ), + V( 58, 0x0160, 56, 59, 0 ), + V( 59, 0x0125, 57, 60, 0 ), + V( 60, 0x00f6, 58, 61, 0 ), + V( 61, 0x00cb, 59, 62, 0 ), + V( 62, 0x00ab, 61, 63, 0 ), + V( 63, 0x008f, 61, 32, 0 ), + V( 64, 0x5b12, 65, 65, 1 ), + V( 65, 0x4d04, 80, 66, 0 ), + V( 66, 0x412c, 81, 67, 0 ), + V( 67, 0x37d8, 82, 68, 0 ), + V( 68, 0x2fe8, 83, 69, 0 ), + V( 69, 0x293c, 84, 70, 0 ), + V( 70, 0x2379, 86, 71, 0 ), + V( 71, 0x1edf, 87, 72, 0 ), + V( 72, 0x1aa9, 87, 73, 0 ), + V( 73, 0x174e, 72, 74, 0 ), + V( 74, 0x1424, 72, 75, 0 ), + V( 75, 0x119c, 74, 76, 0 ), + V( 76, 0x0f6b, 74, 77, 0 ), + V( 77, 0x0d51, 75, 78, 0 ), + V( 78, 0x0bb6, 77, 79, 0 ), + V( 79, 0x0a40, 77, 48, 0 ), + V( 80, 0x5832, 80, 81, 1 ), + V( 81, 0x4d1c, 88, 82, 0 ), + V( 82, 0x438e, 89, 83, 0 ), + V( 83, 0x3bdd, 90, 84, 0 ), + V( 84, 0x34ee, 91, 85, 0 ), + V( 85, 0x2eae, 92, 86, 0 ), + V( 86, 0x299a, 93, 87, 0 ), + V( 87, 0x2516, 86, 71, 0 ), + V( 88, 0x5570, 88, 89, 1 ), + V( 89, 0x4ca9, 95, 90, 0 ), + V( 90, 0x44d9, 96, 91, 0 ), + V( 91, 0x3e22, 97, 92, 0 ), + V( 92, 0x3824, 99, 93, 0 ), + V( 93, 0x32b4, 99, 94, 0 ), + V( 94, 0x2e17, 93, 86, 0 ), + V( 95, 0x56a8, 95, 96, 1 ), + V( 96, 0x4f46, 101, 97, 0 ), + V( 97, 0x47e5, 102, 98, 0 ), + V( 98, 0x41cf, 103, 99, 0 ), + V( 99, 0x3c3d, 104, 100, 0 ), + V( 100, 0x375e, 99, 93, 0 ), + V( 101, 0x5231, 105, 102, 0 ), + V( 102, 0x4c0f, 106, 103, 0 ), + V( 103, 0x4639, 107, 104, 0 ), + V( 104, 0x415e, 103, 99, 0 ), + V( 105, 0x5627, 105, 106, 1 ), + V( 106, 0x50e7, 108, 107, 0 ), + V( 107, 0x4b85, 109, 103, 0 ), + V( 108, 0x5597, 110, 109, 0 ), + V( 109, 0x504f, 111, 107, 0 ), + V( 110, 0x5a10, 110, 111, 1 ), + V( 111, 0x5522, 112, 109, 0 ), + V( 112, 0x59eb, 112, 111, 1 ), +/* + * This last entry is used for fixed probability estimate of 0.5 + * as recommended in Section 10.3 Table 5 of ITU-T Rec. T.851. + */ + V( 113, 0x5a1d, 113, 113, 0 ) +}; diff --git a/media/libjpeg/jcapimin.c b/media/libjpeg/jcapimin.c new file mode 100644 index 000000000..15674be54 --- /dev/null +++ b/media/libjpeg/jcapimin.c @@ -0,0 +1,295 @@ +/* + * jcapimin.c + * + * This file was part of the Independent JPEG Group's software: + * Copyright (C) 1994-1998, Thomas G. Lane. + * Modified 2003-2010 by Guido Vollbeding. + * It was modified by The libjpeg-turbo Project to include only code relevant + * to libjpeg-turbo. + * For conditions of distribution and use, see the accompanying README.ijg + * file. + * + * This file contains application interface code for the compression half + * of the JPEG library. These are the "minimum" API routines that may be + * needed in either the normal full-compression case or the transcoding-only + * case. + * + * Most of the routines intended to be called directly by an application + * are in this file or in jcapistd.c. But also see jcparam.c for + * parameter-setup helper routines, jcomapi.c for routines shared by + * compression and decompression, and jctrans.c for the transcoding case. + */ + +#define JPEG_INTERNALS +#include "jinclude.h" +#include "jpeglib.h" + + +/* + * Initialization of a JPEG compression object. + * The error manager must already be set up (in case memory manager fails). + */ + +GLOBAL(void) +jpeg_CreateCompress (j_compress_ptr cinfo, int version, size_t structsize) +{ + int i; + + /* Guard against version mismatches between library and caller. */ + cinfo->mem = NULL; /* so jpeg_destroy knows mem mgr not called */ + if (version != JPEG_LIB_VERSION) + ERREXIT2(cinfo, JERR_BAD_LIB_VERSION, JPEG_LIB_VERSION, version); + if (structsize != sizeof(struct jpeg_compress_struct)) + ERREXIT2(cinfo, JERR_BAD_STRUCT_SIZE, + (int) sizeof(struct jpeg_compress_struct), (int) structsize); + + /* For debugging purposes, we zero the whole master structure. + * But the application has already set the err pointer, and may have set + * client_data, so we have to save and restore those fields. + * Note: if application hasn't set client_data, tools like Purify may + * complain here. + */ + { + struct jpeg_error_mgr *err = cinfo->err; + void *client_data = cinfo->client_data; /* ignore Purify complaint here */ + MEMZERO(cinfo, sizeof(struct jpeg_compress_struct)); + cinfo->err = err; + cinfo->client_data = client_data; + } + cinfo->is_decompressor = FALSE; + + /* Initialize a memory manager instance for this object */ + jinit_memory_mgr((j_common_ptr) cinfo); + + /* Zero out pointers to permanent structures. */ + cinfo->progress = NULL; + cinfo->dest = NULL; + + cinfo->comp_info = NULL; + + for (i = 0; i < NUM_QUANT_TBLS; i++) { + cinfo->quant_tbl_ptrs[i] = NULL; +#if JPEG_LIB_VERSION >= 70 + cinfo->q_scale_factor[i] = 100; +#endif + } + + for (i = 0; i < NUM_HUFF_TBLS; i++) { + cinfo->dc_huff_tbl_ptrs[i] = NULL; + cinfo->ac_huff_tbl_ptrs[i] = NULL; + } + +#if JPEG_LIB_VERSION >= 80 + /* Must do it here for emit_dqt in case jpeg_write_tables is used */ + cinfo->block_size = DCTSIZE; + cinfo->natural_order = jpeg_natural_order; + cinfo->lim_Se = DCTSIZE2-1; +#endif + + cinfo->script_space = NULL; + + cinfo->input_gamma = 1.0; /* in case application forgets */ + + /* OK, I'm ready */ + cinfo->global_state = CSTATE_START; +} + + +/* + * Destruction of a JPEG compression object + */ + +GLOBAL(void) +jpeg_destroy_compress (j_compress_ptr cinfo) +{ + jpeg_destroy((j_common_ptr) cinfo); /* use common routine */ +} + + +/* + * Abort processing of a JPEG compression operation, + * but don't destroy the object itself. + */ + +GLOBAL(void) +jpeg_abort_compress (j_compress_ptr cinfo) +{ + jpeg_abort((j_common_ptr) cinfo); /* use common routine */ +} + + +/* + * Forcibly suppress or un-suppress all quantization and Huffman tables. + * Marks all currently defined tables as already written (if suppress) + * or not written (if !suppress). This will control whether they get emitted + * by a subsequent jpeg_start_compress call. + * + * This routine is exported for use by applications that want to produce + * abbreviated JPEG datastreams. It logically belongs in jcparam.c, but + * since it is called by jpeg_start_compress, we put it here --- otherwise + * jcparam.o would be linked whether the application used it or not. + */ + +GLOBAL(void) +jpeg_suppress_tables (j_compress_ptr cinfo, boolean suppress) +{ + int i; + JQUANT_TBL *qtbl; + JHUFF_TBL *htbl; + + for (i = 0; i < NUM_QUANT_TBLS; i++) { + if ((qtbl = cinfo->quant_tbl_ptrs[i]) != NULL) + qtbl->sent_table = suppress; + } + + for (i = 0; i < NUM_HUFF_TBLS; i++) { + if ((htbl = cinfo->dc_huff_tbl_ptrs[i]) != NULL) + htbl->sent_table = suppress; + if ((htbl = cinfo->ac_huff_tbl_ptrs[i]) != NULL) + htbl->sent_table = suppress; + } +} + + +/* + * Finish JPEG compression. + * + * If a multipass operating mode was selected, this may do a great deal of + * work including most of the actual output. + */ + +GLOBAL(void) +jpeg_finish_compress (j_compress_ptr cinfo) +{ + JDIMENSION iMCU_row; + + if (cinfo->global_state == CSTATE_SCANNING || + cinfo->global_state == CSTATE_RAW_OK) { + /* Terminate first pass */ + if (cinfo->next_scanline < cinfo->image_height) + ERREXIT(cinfo, JERR_TOO_LITTLE_DATA); + (*cinfo->master->finish_pass) (cinfo); + } else if (cinfo->global_state != CSTATE_WRCOEFS) + ERREXIT1(cinfo, JERR_BAD_STATE, cinfo->global_state); + /* Perform any remaining passes */ + while (! cinfo->master->is_last_pass) { + (*cinfo->master->prepare_for_pass) (cinfo); + for (iMCU_row = 0; iMCU_row < cinfo->total_iMCU_rows; iMCU_row++) { + if (cinfo->progress != NULL) { + cinfo->progress->pass_counter = (long) iMCU_row; + cinfo->progress->pass_limit = (long) cinfo->total_iMCU_rows; + (*cinfo->progress->progress_monitor) ((j_common_ptr) cinfo); + } + /* We bypass the main controller and invoke coef controller directly; + * all work is being done from the coefficient buffer. + */ + if (! (*cinfo->coef->compress_data) (cinfo, (JSAMPIMAGE) NULL)) + ERREXIT(cinfo, JERR_CANT_SUSPEND); + } + (*cinfo->master->finish_pass) (cinfo); + } + /* Write EOI, do final cleanup */ + (*cinfo->marker->write_file_trailer) (cinfo); + (*cinfo->dest->term_destination) (cinfo); + /* We can use jpeg_abort to release memory and reset global_state */ + jpeg_abort((j_common_ptr) cinfo); +} + + +/* + * Write a special marker. + * This is only recommended for writing COM or APPn markers. + * Must be called after jpeg_start_compress() and before + * first call to jpeg_write_scanlines() or jpeg_write_raw_data(). + */ + +GLOBAL(void) +jpeg_write_marker (j_compress_ptr cinfo, int marker, + const JOCTET *dataptr, unsigned int datalen) +{ + void (*write_marker_byte) (j_compress_ptr info, int val); + + if (cinfo->next_scanline != 0 || + (cinfo->global_state != CSTATE_SCANNING && + cinfo->global_state != CSTATE_RAW_OK && + cinfo->global_state != CSTATE_WRCOEFS)) + ERREXIT1(cinfo, JERR_BAD_STATE, cinfo->global_state); + + (*cinfo->marker->write_marker_header) (cinfo, marker, datalen); + write_marker_byte = cinfo->marker->write_marker_byte; /* copy for speed */ + while (datalen--) { + (*write_marker_byte) (cinfo, *dataptr); + dataptr++; + } +} + +/* Same, but piecemeal. */ + +GLOBAL(void) +jpeg_write_m_header (j_compress_ptr cinfo, int marker, unsigned int datalen) +{ + if (cinfo->next_scanline != 0 || + (cinfo->global_state != CSTATE_SCANNING && + cinfo->global_state != CSTATE_RAW_OK && + cinfo->global_state != CSTATE_WRCOEFS)) + ERREXIT1(cinfo, JERR_BAD_STATE, cinfo->global_state); + + (*cinfo->marker->write_marker_header) (cinfo, marker, datalen); +} + +GLOBAL(void) +jpeg_write_m_byte (j_compress_ptr cinfo, int val) +{ + (*cinfo->marker->write_marker_byte) (cinfo, val); +} + + +/* + * Alternate compression function: just write an abbreviated table file. + * Before calling this, all parameters and a data destination must be set up. + * + * To produce a pair of files containing abbreviated tables and abbreviated + * image data, one would proceed as follows: + * + * initialize JPEG object + * set JPEG parameters + * set destination to table file + * jpeg_write_tables(cinfo); + * set destination to image file + * jpeg_start_compress(cinfo, FALSE); + * write data... + * jpeg_finish_compress(cinfo); + * + * jpeg_write_tables has the side effect of marking all tables written + * (same as jpeg_suppress_tables(..., TRUE)). Thus a subsequent start_compress + * will not re-emit the tables unless it is passed write_all_tables=TRUE. + */ + +GLOBAL(void) +jpeg_write_tables (j_compress_ptr cinfo) +{ + if (cinfo->global_state != CSTATE_START) + ERREXIT1(cinfo, JERR_BAD_STATE, cinfo->global_state); + + /* (Re)initialize error mgr and destination modules */ + (*cinfo->err->reset_error_mgr) ((j_common_ptr) cinfo); + (*cinfo->dest->init_destination) (cinfo); + /* Initialize the marker writer ... bit of a crock to do it here. */ + jinit_marker_writer(cinfo); + /* Write them tables! */ + (*cinfo->marker->write_tables_only) (cinfo); + /* And clean up. */ + (*cinfo->dest->term_destination) (cinfo); + /* + * In library releases up through v6a, we called jpeg_abort() here to free + * any working memory allocated by the destination manager and marker + * writer. Some applications had a problem with that: they allocated space + * of their own from the library memory manager, and didn't want it to go + * away during write_tables. So now we do nothing. This will cause a + * memory leak if an app calls write_tables repeatedly without doing a full + * compression cycle or otherwise resetting the JPEG object. However, that + * seems less bad than unexpectedly freeing memory in the normal case. + * An app that prefers the old behavior can call jpeg_abort for itself after + * each call to jpeg_write_tables(). + */ +} diff --git a/media/libjpeg/jcapistd.c b/media/libjpeg/jcapistd.c new file mode 100644 index 000000000..5c6d0be25 --- /dev/null +++ b/media/libjpeg/jcapistd.c @@ -0,0 +1,162 @@ +/* + * jcapistd.c + * + * Copyright (C) 1994-1996, Thomas G. Lane. + * This file is part of the Independent JPEG Group's software. + * For conditions of distribution and use, see the accompanying README.ijg + * file. + * + * This file contains application interface code for the compression half + * of the JPEG library. These are the "standard" API routines that are + * used in the normal full-compression case. They are not used by a + * transcoding-only application. Note that if an application links in + * jpeg_start_compress, it will end up linking in the entire compressor. + * We thus must separate this file from jcapimin.c to avoid linking the + * whole compression library into a transcoder. + */ + +#define JPEG_INTERNALS +#include "jinclude.h" +#include "jpeglib.h" + + +/* + * Compression initialization. + * Before calling this, all parameters and a data destination must be set up. + * + * We require a write_all_tables parameter as a failsafe check when writing + * multiple datastreams from the same compression object. Since prior runs + * will have left all the tables marked sent_table=TRUE, a subsequent run + * would emit an abbreviated stream (no tables) by default. This may be what + * is wanted, but for safety's sake it should not be the default behavior: + * programmers should have to make a deliberate choice to emit abbreviated + * images. Therefore the documentation and examples should encourage people + * to pass write_all_tables=TRUE; then it will take active thought to do the + * wrong thing. + */ + +GLOBAL(void) +jpeg_start_compress (j_compress_ptr cinfo, boolean write_all_tables) +{ + if (cinfo->global_state != CSTATE_START) + ERREXIT1(cinfo, JERR_BAD_STATE, cinfo->global_state); + + if (write_all_tables) + jpeg_suppress_tables(cinfo, FALSE); /* mark all tables to be written */ + + /* (Re)initialize error mgr and destination modules */ + (*cinfo->err->reset_error_mgr) ((j_common_ptr) cinfo); + (*cinfo->dest->init_destination) (cinfo); + /* Perform master selection of active modules */ + jinit_compress_master(cinfo); + /* Set up for the first pass */ + (*cinfo->master->prepare_for_pass) (cinfo); + /* Ready for application to drive first pass through jpeg_write_scanlines + * or jpeg_write_raw_data. + */ + cinfo->next_scanline = 0; + cinfo->global_state = (cinfo->raw_data_in ? CSTATE_RAW_OK : CSTATE_SCANNING); +} + + +/* + * Write some scanlines of data to the JPEG compressor. + * + * The return value will be the number of lines actually written. + * This should be less than the supplied num_lines only in case that + * the data destination module has requested suspension of the compressor, + * or if more than image_height scanlines are passed in. + * + * Note: we warn about excess calls to jpeg_write_scanlines() since + * this likely signals an application programmer error. However, + * excess scanlines passed in the last valid call are *silently* ignored, + * so that the application need not adjust num_lines for end-of-image + * when using a multiple-scanline buffer. + */ + +GLOBAL(JDIMENSION) +jpeg_write_scanlines (j_compress_ptr cinfo, JSAMPARRAY scanlines, + JDIMENSION num_lines) +{ + JDIMENSION row_ctr, rows_left; + + if (cinfo->global_state != CSTATE_SCANNING) + ERREXIT1(cinfo, JERR_BAD_STATE, cinfo->global_state); + if (cinfo->next_scanline >= cinfo->image_height) + WARNMS(cinfo, JWRN_TOO_MUCH_DATA); + + /* Call progress monitor hook if present */ + if (cinfo->progress != NULL) { + cinfo->progress->pass_counter = (long) cinfo->next_scanline; + cinfo->progress->pass_limit = (long) cinfo->image_height; + (*cinfo->progress->progress_monitor) ((j_common_ptr) cinfo); + } + + /* Give master control module another chance if this is first call to + * jpeg_write_scanlines. This lets output of the frame/scan headers be + * delayed so that application can write COM, etc, markers between + * jpeg_start_compress and jpeg_write_scanlines. + */ + if (cinfo->master->call_pass_startup) + (*cinfo->master->pass_startup) (cinfo); + + /* Ignore any extra scanlines at bottom of image. */ + rows_left = cinfo->image_height - cinfo->next_scanline; + if (num_lines > rows_left) + num_lines = rows_left; + + row_ctr = 0; + (*cinfo->main->process_data) (cinfo, scanlines, &row_ctr, num_lines); + cinfo->next_scanline += row_ctr; + return row_ctr; +} + + +/* + * Alternate entry point to write raw data. + * Processes exactly one iMCU row per call, unless suspended. + */ + +GLOBAL(JDIMENSION) +jpeg_write_raw_data (j_compress_ptr cinfo, JSAMPIMAGE data, + JDIMENSION num_lines) +{ + JDIMENSION lines_per_iMCU_row; + + if (cinfo->global_state != CSTATE_RAW_OK) + ERREXIT1(cinfo, JERR_BAD_STATE, cinfo->global_state); + if (cinfo->next_scanline >= cinfo->image_height) { + WARNMS(cinfo, JWRN_TOO_MUCH_DATA); + return 0; + } + + /* Call progress monitor hook if present */ + if (cinfo->progress != NULL) { + cinfo->progress->pass_counter = (long) cinfo->next_scanline; + cinfo->progress->pass_limit = (long) cinfo->image_height; + (*cinfo->progress->progress_monitor) ((j_common_ptr) cinfo); + } + + /* Give master control module another chance if this is first call to + * jpeg_write_raw_data. This lets output of the frame/scan headers be + * delayed so that application can write COM, etc, markers between + * jpeg_start_compress and jpeg_write_raw_data. + */ + if (cinfo->master->call_pass_startup) + (*cinfo->master->pass_startup) (cinfo); + + /* Verify that at least one iMCU row has been passed. */ + lines_per_iMCU_row = cinfo->max_v_samp_factor * DCTSIZE; + if (num_lines < lines_per_iMCU_row) + ERREXIT(cinfo, JERR_BUFFER_SIZE); + + /* Directly compress the row. */ + if (! (*cinfo->coef->compress_data) (cinfo, data)) { + /* If compressor did not consume the whole row, suspend processing. */ + return 0; + } + + /* OK, we processed one iMCU row. */ + cinfo->next_scanline += lines_per_iMCU_row; + return lines_per_iMCU_row; +} diff --git a/media/libjpeg/jcarith.c b/media/libjpeg/jcarith.c new file mode 100644 index 000000000..6d3b8af5b --- /dev/null +++ b/media/libjpeg/jcarith.c @@ -0,0 +1,928 @@ +/* + * jcarith.c + * + * This file was part of the Independent JPEG Group's software: + * Developed 1997-2009 by Guido Vollbeding. + * libjpeg-turbo Modifications: + * Copyright (C) 2015, D. R. Commander. + * For conditions of distribution and use, see the accompanying README.ijg + * file. + * + * This file contains portable arithmetic entropy encoding routines for JPEG + * (implementing the ISO/IEC IS 10918-1 and CCITT Recommendation ITU-T T.81). + * + * Both sequential and progressive modes are supported in this single module. + * + * Suspension is not currently supported in this module. + */ + +#define JPEG_INTERNALS +#include "jinclude.h" +#include "jpeglib.h" + + +/* Expanded entropy encoder object for arithmetic encoding. */ + +typedef struct { + struct jpeg_entropy_encoder pub; /* public fields */ + + JLONG c; /* C register, base of coding interval, layout as in sec. D.1.3 */ + JLONG a; /* A register, normalized size of coding interval */ + JLONG sc; /* counter for stacked 0xFF values which might overflow */ + JLONG zc; /* counter for pending 0x00 output values which might * + * be discarded at the end ("Pacman" termination) */ + int ct; /* bit shift counter, determines when next byte will be written */ + int buffer; /* buffer for most recent output byte != 0xFF */ + + int last_dc_val[MAX_COMPS_IN_SCAN]; /* last DC coef for each component */ + int dc_context[MAX_COMPS_IN_SCAN]; /* context index for DC conditioning */ + + unsigned int restarts_to_go; /* MCUs left in this restart interval */ + int next_restart_num; /* next restart number to write (0-7) */ + + /* Pointers to statistics areas (these workspaces have image lifespan) */ + unsigned char *dc_stats[NUM_ARITH_TBLS]; + unsigned char *ac_stats[NUM_ARITH_TBLS]; + + /* Statistics bin for coding with fixed probability 0.5 */ + unsigned char fixed_bin[4]; +} arith_entropy_encoder; + +typedef arith_entropy_encoder *arith_entropy_ptr; + +/* The following two definitions specify the allocation chunk size + * for the statistics area. + * According to sections F.1.4.4.1.3 and F.1.4.4.2, we need at least + * 49 statistics bins for DC, and 245 statistics bins for AC coding. + * + * We use a compact representation with 1 byte per statistics bin, + * thus the numbers directly represent byte sizes. + * This 1 byte per statistics bin contains the meaning of the MPS + * (more probable symbol) in the highest bit (mask 0x80), and the + * index into the probability estimation state machine table + * in the lower bits (mask 0x7F). + */ + +#define DC_STAT_BINS 64 +#define AC_STAT_BINS 256 + +/* NOTE: Uncomment the following #define if you want to use the + * given formula for calculating the AC conditioning parameter Kx + * for spectral selection progressive coding in section G.1.3.2 + * of the spec (Kx = Kmin + SRL (8 + Se - Kmin) 4). + * Although the spec and P&M authors claim that this "has proven + * to give good results for 8 bit precision samples", I'm not + * convinced yet that this is really beneficial. + * Early tests gave only very marginal compression enhancements + * (a few - around 5 or so - bytes even for very large files), + * which would turn out rather negative if we'd suppress the + * DAC (Define Arithmetic Conditioning) marker segments for + * the default parameters in the future. + * Note that currently the marker writing module emits 12-byte + * DAC segments for a full-component scan in a color image. + * This is not worth worrying about IMHO. However, since the + * spec defines the default values to be used if the tables + * are omitted (unlike Huffman tables, which are required + * anyway), one might optimize this behaviour in the future, + * and then it would be disadvantageous to use custom tables if + * they don't provide sufficient gain to exceed the DAC size. + * + * On the other hand, I'd consider it as a reasonable result + * that the conditioning has no significant influence on the + * compression performance. This means that the basic + * statistical model is already rather stable. + * + * Thus, at the moment, we use the default conditioning values + * anyway, and do not use the custom formula. + * +#define CALCULATE_SPECTRAL_CONDITIONING + */ + +/* IRIGHT_SHIFT is like RIGHT_SHIFT, but works on int rather than JLONG. + * We assume that int right shift is unsigned if JLONG right shift is, + * which should be safe. + */ + +#ifdef RIGHT_SHIFT_IS_UNSIGNED +#define ISHIFT_TEMPS int ishift_temp; +#define IRIGHT_SHIFT(x,shft) \ + ((ishift_temp = (x)) < 0 ? \ + (ishift_temp >> (shft)) | ((~0) << (16-(shft))) : \ + (ishift_temp >> (shft))) +#else +#define ISHIFT_TEMPS +#define IRIGHT_SHIFT(x,shft) ((x) >> (shft)) +#endif + + +LOCAL(void) +emit_byte (int val, j_compress_ptr cinfo) +/* Write next output byte; we do not support suspension in this module. */ +{ + struct jpeg_destination_mgr *dest = cinfo->dest; + + *dest->next_output_byte++ = (JOCTET) val; + if (--dest->free_in_buffer == 0) + if (! (*dest->empty_output_buffer) (cinfo)) + ERREXIT(cinfo, JERR_CANT_SUSPEND); +} + + +/* + * Finish up at the end of an arithmetic-compressed scan. + */ + +METHODDEF(void) +finish_pass (j_compress_ptr cinfo) +{ + arith_entropy_ptr e = (arith_entropy_ptr) cinfo->entropy; + JLONG temp; + + /* Section D.1.8: Termination of encoding */ + + /* Find the e->c in the coding interval with the largest + * number of trailing zero bits */ + if ((temp = (e->a - 1 + e->c) & 0xFFFF0000L) < e->c) + e->c = temp + 0x8000L; + else + e->c = temp; + /* Send remaining bytes to output */ + e->c <<= e->ct; + if (e->c & 0xF8000000L) { + /* One final overflow has to be handled */ + if (e->buffer >= 0) { + if (e->zc) + do emit_byte(0x00, cinfo); + while (--e->zc); + emit_byte(e->buffer + 1, cinfo); + if (e->buffer + 1 == 0xFF) + emit_byte(0x00, cinfo); + } + e->zc += e->sc; /* carry-over converts stacked 0xFF bytes to 0x00 */ + e->sc = 0; + } else { + if (e->buffer == 0) + ++e->zc; + else if (e->buffer >= 0) { + if (e->zc) + do emit_byte(0x00, cinfo); + while (--e->zc); + emit_byte(e->buffer, cinfo); + } + if (e->sc) { + if (e->zc) + do emit_byte(0x00, cinfo); + while (--e->zc); + do { + emit_byte(0xFF, cinfo); + emit_byte(0x00, cinfo); + } while (--e->sc); + } + } + /* Output final bytes only if they are not 0x00 */ + if (e->c & 0x7FFF800L) { + if (e->zc) /* output final pending zero bytes */ + do emit_byte(0x00, cinfo); + while (--e->zc); + emit_byte((e->c >> 19) & 0xFF, cinfo); + if (((e->c >> 19) & 0xFF) == 0xFF) + emit_byte(0x00, cinfo); + if (e->c & 0x7F800L) { + emit_byte((e->c >> 11) & 0xFF, cinfo); + if (((e->c >> 11) & 0xFF) == 0xFF) + emit_byte(0x00, cinfo); + } + } +} + + +/* + * The core arithmetic encoding routine (common in JPEG and JBIG). + * This needs to go as fast as possible. + * Machine-dependent optimization facilities + * are not utilized in this portable implementation. + * However, this code should be fairly efficient and + * may be a good base for further optimizations anyway. + * + * Parameter 'val' to be encoded may be 0 or 1 (binary decision). + * + * Note: I've added full "Pacman" termination support to the + * byte output routines, which is equivalent to the optional + * Discard_final_zeros procedure (Figure D.15) in the spec. + * Thus, we always produce the shortest possible output + * stream compliant to the spec (no trailing zero bytes, + * except for FF stuffing). + * + * I've also introduced a new scheme for accessing + * the probability estimation state machine table, + * derived from Markus Kuhn's JBIG implementation. + */ + +LOCAL(void) +arith_encode (j_compress_ptr cinfo, unsigned char *st, int val) +{ + register arith_entropy_ptr e = (arith_entropy_ptr) cinfo->entropy; + register unsigned char nl, nm; + register JLONG qe, temp; + register int sv; + + /* Fetch values from our compact representation of Table D.2: + * Qe values and probability estimation state machine + */ + sv = *st; + qe = jpeg_aritab[sv & 0x7F]; /* => Qe_Value */ + nl = qe & 0xFF; qe >>= 8; /* Next_Index_LPS + Switch_MPS */ + nm = qe & 0xFF; qe >>= 8; /* Next_Index_MPS */ + + /* Encode & estimation procedures per sections D.1.4 & D.1.5 */ + e->a -= qe; + if (val != (sv >> 7)) { + /* Encode the less probable symbol */ + if (e->a >= qe) { + /* If the interval size (qe) for the less probable symbol (LPS) + * is larger than the interval size for the MPS, then exchange + * the two symbols for coding efficiency, otherwise code the LPS + * as usual: */ + e->c += e->a; + e->a = qe; + } + *st = (sv & 0x80) ^ nl; /* Estimate_after_LPS */ + } else { + /* Encode the more probable symbol */ + if (e->a >= 0x8000L) + return; /* A >= 0x8000 -> ready, no renormalization required */ + if (e->a < qe) { + /* If the interval size (qe) for the less probable symbol (LPS) + * is larger than the interval size for the MPS, then exchange + * the two symbols for coding efficiency: */ + e->c += e->a; + e->a = qe; + } + *st = (sv & 0x80) ^ nm; /* Estimate_after_MPS */ + } + + /* Renormalization & data output per section D.1.6 */ + do { + e->a <<= 1; + e->c <<= 1; + if (--e->ct == 0) { + /* Another byte is ready for output */ + temp = e->c >> 19; + if (temp > 0xFF) { + /* Handle overflow over all stacked 0xFF bytes */ + if (e->buffer >= 0) { + if (e->zc) + do emit_byte(0x00, cinfo); + while (--e->zc); + emit_byte(e->buffer + 1, cinfo); + if (e->buffer + 1 == 0xFF) + emit_byte(0x00, cinfo); + } + e->zc += e->sc; /* carry-over converts stacked 0xFF bytes to 0x00 */ + e->sc = 0; + /* Note: The 3 spacer bits in the C register guarantee + * that the new buffer byte can't be 0xFF here + * (see page 160 in the P&M JPEG book). */ + e->buffer = temp & 0xFF; /* new output byte, might overflow later */ + } else if (temp == 0xFF) { + ++e->sc; /* stack 0xFF byte (which might overflow later) */ + } else { + /* Output all stacked 0xFF bytes, they will not overflow any more */ + if (e->buffer == 0) + ++e->zc; + else if (e->buffer >= 0) { + if (e->zc) + do emit_byte(0x00, cinfo); + while (--e->zc); + emit_byte(e->buffer, cinfo); + } + if (e->sc) { + if (e->zc) + do emit_byte(0x00, cinfo); + while (--e->zc); + do { + emit_byte(0xFF, cinfo); + emit_byte(0x00, cinfo); + } while (--e->sc); + } + e->buffer = temp & 0xFF; /* new output byte (can still overflow) */ + } + e->c &= 0x7FFFFL; + e->ct += 8; + } + } while (e->a < 0x8000L); +} + + +/* + * Emit a restart marker & resynchronize predictions. + */ + +LOCAL(void) +emit_restart (j_compress_ptr cinfo, int restart_num) +{ + arith_entropy_ptr entropy = (arith_entropy_ptr) cinfo->entropy; + int ci; + jpeg_component_info *compptr; + + finish_pass(cinfo); + + emit_byte(0xFF, cinfo); + emit_byte(JPEG_RST0 + restart_num, cinfo); + + /* Re-initialize statistics areas */ + for (ci = 0; ci < cinfo->comps_in_scan; ci++) { + compptr = cinfo->cur_comp_info[ci]; + /* DC needs no table for refinement scan */ + if (cinfo->progressive_mode == 0 || (cinfo->Ss == 0 && cinfo->Ah == 0)) { + MEMZERO(entropy->dc_stats[compptr->dc_tbl_no], DC_STAT_BINS); + /* Reset DC predictions to 0 */ + entropy->last_dc_val[ci] = 0; + entropy->dc_context[ci] = 0; + } + /* AC needs no table when not present */ + if (cinfo->progressive_mode == 0 || cinfo->Se) { + MEMZERO(entropy->ac_stats[compptr->ac_tbl_no], AC_STAT_BINS); + } + } + + /* Reset arithmetic encoding variables */ + entropy->c = 0; + entropy->a = 0x10000L; + entropy->sc = 0; + entropy->zc = 0; + entropy->ct = 11; + entropy->buffer = -1; /* empty */ +} + + +/* + * MCU encoding for DC initial scan (either spectral selection, + * or first pass of successive approximation). + */ + +METHODDEF(boolean) +encode_mcu_DC_first (j_compress_ptr cinfo, JBLOCKROW *MCU_data) +{ + arith_entropy_ptr entropy = (arith_entropy_ptr) cinfo->entropy; + JBLOCKROW block; + unsigned char *st; + int blkn, ci, tbl; + int v, v2, m; + ISHIFT_TEMPS + + /* Emit restart marker if needed */ + if (cinfo->restart_interval) { + if (entropy->restarts_to_go == 0) { + emit_restart(cinfo, entropy->next_restart_num); + entropy->restarts_to_go = cinfo->restart_interval; + entropy->next_restart_num++; + entropy->next_restart_num &= 7; + } + entropy->restarts_to_go--; + } + + /* Encode the MCU data blocks */ + for (blkn = 0; blkn < cinfo->blocks_in_MCU; blkn++) { + block = MCU_data[blkn]; + ci = cinfo->MCU_membership[blkn]; + tbl = cinfo->cur_comp_info[ci]->dc_tbl_no; + + /* Compute the DC value after the required point transform by Al. + * This is simply an arithmetic right shift. + */ + m = IRIGHT_SHIFT((int) ((*block)[0]), cinfo->Al); + + /* Sections F.1.4.1 & F.1.4.4.1: Encoding of DC coefficients */ + + /* Table F.4: Point to statistics bin S0 for DC coefficient coding */ + st = entropy->dc_stats[tbl] + entropy->dc_context[ci]; + + /* Figure F.4: Encode_DC_DIFF */ + if ((v = m - entropy->last_dc_val[ci]) == 0) { + arith_encode(cinfo, st, 0); + entropy->dc_context[ci] = 0; /* zero diff category */ + } else { + entropy->last_dc_val[ci] = m; + arith_encode(cinfo, st, 1); + /* Figure F.6: Encoding nonzero value v */ + /* Figure F.7: Encoding the sign of v */ + if (v > 0) { + arith_encode(cinfo, st + 1, 0); /* Table F.4: SS = S0 + 1 */ + st += 2; /* Table F.4: SP = S0 + 2 */ + entropy->dc_context[ci] = 4; /* small positive diff category */ + } else { + v = -v; + arith_encode(cinfo, st + 1, 1); /* Table F.4: SS = S0 + 1 */ + st += 3; /* Table F.4: SN = S0 + 3 */ + entropy->dc_context[ci] = 8; /* small negative diff category */ + } + /* Figure F.8: Encoding the magnitude category of v */ + m = 0; + if (v -= 1) { + arith_encode(cinfo, st, 1); + m = 1; + v2 = v; + st = entropy->dc_stats[tbl] + 20; /* Table F.4: X1 = 20 */ + while (v2 >>= 1) { + arith_encode(cinfo, st, 1); + m <<= 1; + st += 1; + } + } + arith_encode(cinfo, st, 0); + /* Section F.1.4.4.1.2: Establish dc_context conditioning category */ + if (m < (int) ((1L << cinfo->arith_dc_L[tbl]) >> 1)) + entropy->dc_context[ci] = 0; /* zero diff category */ + else if (m > (int) ((1L << cinfo->arith_dc_U[tbl]) >> 1)) + entropy->dc_context[ci] += 8; /* large diff category */ + /* Figure F.9: Encoding the magnitude bit pattern of v */ + st += 14; + while (m >>= 1) + arith_encode(cinfo, st, (m & v) ? 1 : 0); + } + } + + return TRUE; +} + + +/* + * MCU encoding for AC initial scan (either spectral selection, + * or first pass of successive approximation). + */ + +METHODDEF(boolean) +encode_mcu_AC_first (j_compress_ptr cinfo, JBLOCKROW *MCU_data) +{ + arith_entropy_ptr entropy = (arith_entropy_ptr) cinfo->entropy; + JBLOCKROW block; + unsigned char *st; + int tbl, k, ke; + int v, v2, m; + + /* Emit restart marker if needed */ + if (cinfo->restart_interval) { + if (entropy->restarts_to_go == 0) { + emit_restart(cinfo, entropy->next_restart_num); + entropy->restarts_to_go = cinfo->restart_interval; + entropy->next_restart_num++; + entropy->next_restart_num &= 7; + } + entropy->restarts_to_go--; + } + + /* Encode the MCU data block */ + block = MCU_data[0]; + tbl = cinfo->cur_comp_info[0]->ac_tbl_no; + + /* Sections F.1.4.2 & F.1.4.4.2: Encoding of AC coefficients */ + + /* Establish EOB (end-of-block) index */ + for (ke = cinfo->Se; ke > 0; ke--) + /* We must apply the point transform by Al. For AC coefficients this + * is an integer division with rounding towards 0. To do this portably + * in C, we shift after obtaining the absolute value. + */ + if ((v = (*block)[jpeg_natural_order[ke]]) >= 0) { + if (v >>= cinfo->Al) break; + } else { + v = -v; + if (v >>= cinfo->Al) break; + } + + /* Figure F.5: Encode_AC_Coefficients */ + for (k = cinfo->Ss; k <= ke; k++) { + st = entropy->ac_stats[tbl] + 3 * (k - 1); + arith_encode(cinfo, st, 0); /* EOB decision */ + for (;;) { + if ((v = (*block)[jpeg_natural_order[k]]) >= 0) { + if (v >>= cinfo->Al) { + arith_encode(cinfo, st + 1, 1); + arith_encode(cinfo, entropy->fixed_bin, 0); + break; + } + } else { + v = -v; + if (v >>= cinfo->Al) { + arith_encode(cinfo, st + 1, 1); + arith_encode(cinfo, entropy->fixed_bin, 1); + break; + } + } + arith_encode(cinfo, st + 1, 0); st += 3; k++; + } + st += 2; + /* Figure F.8: Encoding the magnitude category of v */ + m = 0; + if (v -= 1) { + arith_encode(cinfo, st, 1); + m = 1; + v2 = v; + if (v2 >>= 1) { + arith_encode(cinfo, st, 1); + m <<= 1; + st = entropy->ac_stats[tbl] + + (k <= cinfo->arith_ac_K[tbl] ? 189 : 217); + while (v2 >>= 1) { + arith_encode(cinfo, st, 1); + m <<= 1; + st += 1; + } + } + } + arith_encode(cinfo, st, 0); + /* Figure F.9: Encoding the magnitude bit pattern of v */ + st += 14; + while (m >>= 1) + arith_encode(cinfo, st, (m & v) ? 1 : 0); + } + /* Encode EOB decision only if k <= cinfo->Se */ + if (k <= cinfo->Se) { + st = entropy->ac_stats[tbl] + 3 * (k - 1); + arith_encode(cinfo, st, 1); + } + + return TRUE; +} + + +/* + * MCU encoding for DC successive approximation refinement scan. + */ + +METHODDEF(boolean) +encode_mcu_DC_refine (j_compress_ptr cinfo, JBLOCKROW *MCU_data) +{ + arith_entropy_ptr entropy = (arith_entropy_ptr) cinfo->entropy; + unsigned char *st; + int Al, blkn; + + /* Emit restart marker if needed */ + if (cinfo->restart_interval) { + if (entropy->restarts_to_go == 0) { + emit_restart(cinfo, entropy->next_restart_num); + entropy->restarts_to_go = cinfo->restart_interval; + entropy->next_restart_num++; + entropy->next_restart_num &= 7; + } + entropy->restarts_to_go--; + } + + st = entropy->fixed_bin; /* use fixed probability estimation */ + Al = cinfo->Al; + + /* Encode the MCU data blocks */ + for (blkn = 0; blkn < cinfo->blocks_in_MCU; blkn++) { + /* We simply emit the Al'th bit of the DC coefficient value. */ + arith_encode(cinfo, st, (MCU_data[blkn][0][0] >> Al) & 1); + } + + return TRUE; +} + + +/* + * MCU encoding for AC successive approximation refinement scan. + */ + +METHODDEF(boolean) +encode_mcu_AC_refine (j_compress_ptr cinfo, JBLOCKROW *MCU_data) +{ + arith_entropy_ptr entropy = (arith_entropy_ptr) cinfo->entropy; + JBLOCKROW block; + unsigned char *st; + int tbl, k, ke, kex; + int v; + + /* Emit restart marker if needed */ + if (cinfo->restart_interval) { + if (entropy->restarts_to_go == 0) { + emit_restart(cinfo, entropy->next_restart_num); + entropy->restarts_to_go = cinfo->restart_interval; + entropy->next_restart_num++; + entropy->next_restart_num &= 7; + } + entropy->restarts_to_go--; + } + + /* Encode the MCU data block */ + block = MCU_data[0]; + tbl = cinfo->cur_comp_info[0]->ac_tbl_no; + + /* Section G.1.3.3: Encoding of AC coefficients */ + + /* Establish EOB (end-of-block) index */ + for (ke = cinfo->Se; ke > 0; ke--) + /* We must apply the point transform by Al. For AC coefficients this + * is an integer division with rounding towards 0. To do this portably + * in C, we shift after obtaining the absolute value. + */ + if ((v = (*block)[jpeg_natural_order[ke]]) >= 0) { + if (v >>= cinfo->Al) break; + } else { + v = -v; + if (v >>= cinfo->Al) break; + } + + /* Establish EOBx (previous stage end-of-block) index */ + for (kex = ke; kex > 0; kex--) + if ((v = (*block)[jpeg_natural_order[kex]]) >= 0) { + if (v >>= cinfo->Ah) break; + } else { + v = -v; + if (v >>= cinfo->Ah) break; + } + + /* Figure G.10: Encode_AC_Coefficients_SA */ + for (k = cinfo->Ss; k <= ke; k++) { + st = entropy->ac_stats[tbl] + 3 * (k - 1); + if (k > kex) + arith_encode(cinfo, st, 0); /* EOB decision */ + for (;;) { + if ((v = (*block)[jpeg_natural_order[k]]) >= 0) { + if (v >>= cinfo->Al) { + if (v >> 1) /* previously nonzero coef */ + arith_encode(cinfo, st + 2, (v & 1)); + else { /* newly nonzero coef */ + arith_encode(cinfo, st + 1, 1); + arith_encode(cinfo, entropy->fixed_bin, 0); + } + break; + } + } else { + v = -v; + if (v >>= cinfo->Al) { + if (v >> 1) /* previously nonzero coef */ + arith_encode(cinfo, st + 2, (v & 1)); + else { /* newly nonzero coef */ + arith_encode(cinfo, st + 1, 1); + arith_encode(cinfo, entropy->fixed_bin, 1); + } + break; + } + } + arith_encode(cinfo, st + 1, 0); st += 3; k++; + } + } + /* Encode EOB decision only if k <= cinfo->Se */ + if (k <= cinfo->Se) { + st = entropy->ac_stats[tbl] + 3 * (k - 1); + arith_encode(cinfo, st, 1); + } + + return TRUE; +} + + +/* + * Encode and output one MCU's worth of arithmetic-compressed coefficients. + */ + +METHODDEF(boolean) +encode_mcu (j_compress_ptr cinfo, JBLOCKROW *MCU_data) +{ + arith_entropy_ptr entropy = (arith_entropy_ptr) cinfo->entropy; + jpeg_component_info *compptr; + JBLOCKROW block; + unsigned char *st; + int blkn, ci, tbl, k, ke; + int v, v2, m; + + /* Emit restart marker if needed */ + if (cinfo->restart_interval) { + if (entropy->restarts_to_go == 0) { + emit_restart(cinfo, entropy->next_restart_num); + entropy->restarts_to_go = cinfo->restart_interval; + entropy->next_restart_num++; + entropy->next_restart_num &= 7; + } + entropy->restarts_to_go--; + } + + /* Encode the MCU data blocks */ + for (blkn = 0; blkn < cinfo->blocks_in_MCU; blkn++) { + block = MCU_data[blkn]; + ci = cinfo->MCU_membership[blkn]; + compptr = cinfo->cur_comp_info[ci]; + + /* Sections F.1.4.1 & F.1.4.4.1: Encoding of DC coefficients */ + + tbl = compptr->dc_tbl_no; + + /* Table F.4: Point to statistics bin S0 for DC coefficient coding */ + st = entropy->dc_stats[tbl] + entropy->dc_context[ci]; + + /* Figure F.4: Encode_DC_DIFF */ + if ((v = (*block)[0] - entropy->last_dc_val[ci]) == 0) { + arith_encode(cinfo, st, 0); + entropy->dc_context[ci] = 0; /* zero diff category */ + } else { + entropy->last_dc_val[ci] = (*block)[0]; + arith_encode(cinfo, st, 1); + /* Figure F.6: Encoding nonzero value v */ + /* Figure F.7: Encoding the sign of v */ + if (v > 0) { + arith_encode(cinfo, st + 1, 0); /* Table F.4: SS = S0 + 1 */ + st += 2; /* Table F.4: SP = S0 + 2 */ + entropy->dc_context[ci] = 4; /* small positive diff category */ + } else { + v = -v; + arith_encode(cinfo, st + 1, 1); /* Table F.4: SS = S0 + 1 */ + st += 3; /* Table F.4: SN = S0 + 3 */ + entropy->dc_context[ci] = 8; /* small negative diff category */ + } + /* Figure F.8: Encoding the magnitude category of v */ + m = 0; + if (v -= 1) { + arith_encode(cinfo, st, 1); + m = 1; + v2 = v; + st = entropy->dc_stats[tbl] + 20; /* Table F.4: X1 = 20 */ + while (v2 >>= 1) { + arith_encode(cinfo, st, 1); + m <<= 1; + st += 1; + } + } + arith_encode(cinfo, st, 0); + /* Section F.1.4.4.1.2: Establish dc_context conditioning category */ + if (m < (int) ((1L << cinfo->arith_dc_L[tbl]) >> 1)) + entropy->dc_context[ci] = 0; /* zero diff category */ + else if (m > (int) ((1L << cinfo->arith_dc_U[tbl]) >> 1)) + entropy->dc_context[ci] += 8; /* large diff category */ + /* Figure F.9: Encoding the magnitude bit pattern of v */ + st += 14; + while (m >>= 1) + arith_encode(cinfo, st, (m & v) ? 1 : 0); + } + + /* Sections F.1.4.2 & F.1.4.4.2: Encoding of AC coefficients */ + + tbl = compptr->ac_tbl_no; + + /* Establish EOB (end-of-block) index */ + for (ke = DCTSIZE2 - 1; ke > 0; ke--) + if ((*block)[jpeg_natural_order[ke]]) break; + + /* Figure F.5: Encode_AC_Coefficients */ + for (k = 1; k <= ke; k++) { + st = entropy->ac_stats[tbl] + 3 * (k - 1); + arith_encode(cinfo, st, 0); /* EOB decision */ + while ((v = (*block)[jpeg_natural_order[k]]) == 0) { + arith_encode(cinfo, st + 1, 0); st += 3; k++; + } + arith_encode(cinfo, st + 1, 1); + /* Figure F.6: Encoding nonzero value v */ + /* Figure F.7: Encoding the sign of v */ + if (v > 0) { + arith_encode(cinfo, entropy->fixed_bin, 0); + } else { + v = -v; + arith_encode(cinfo, entropy->fixed_bin, 1); + } + st += 2; + /* Figure F.8: Encoding the magnitude category of v */ + m = 0; + if (v -= 1) { + arith_encode(cinfo, st, 1); + m = 1; + v2 = v; + if (v2 >>= 1) { + arith_encode(cinfo, st, 1); + m <<= 1; + st = entropy->ac_stats[tbl] + + (k <= cinfo->arith_ac_K[tbl] ? 189 : 217); + while (v2 >>= 1) { + arith_encode(cinfo, st, 1); + m <<= 1; + st += 1; + } + } + } + arith_encode(cinfo, st, 0); + /* Figure F.9: Encoding the magnitude bit pattern of v */ + st += 14; + while (m >>= 1) + arith_encode(cinfo, st, (m & v) ? 1 : 0); + } + /* Encode EOB decision only if k <= DCTSIZE2 - 1 */ + if (k <= DCTSIZE2 - 1) { + st = entropy->ac_stats[tbl] + 3 * (k - 1); + arith_encode(cinfo, st, 1); + } + } + + return TRUE; +} + + +/* + * Initialize for an arithmetic-compressed scan. + */ + +METHODDEF(void) +start_pass (j_compress_ptr cinfo, boolean gather_statistics) +{ + arith_entropy_ptr entropy = (arith_entropy_ptr) cinfo->entropy; + int ci, tbl; + jpeg_component_info *compptr; + + if (gather_statistics) + /* Make sure to avoid that in the master control logic! + * We are fully adaptive here and need no extra + * statistics gathering pass! + */ + ERREXIT(cinfo, JERR_NOT_COMPILED); + + /* We assume jcmaster.c already validated the progressive scan parameters. */ + + /* Select execution routines */ + if (cinfo->progressive_mode) { + if (cinfo->Ah == 0) { + if (cinfo->Ss == 0) + entropy->pub.encode_mcu = encode_mcu_DC_first; + else + entropy->pub.encode_mcu = encode_mcu_AC_first; + } else { + if (cinfo->Ss == 0) + entropy->pub.encode_mcu = encode_mcu_DC_refine; + else + entropy->pub.encode_mcu = encode_mcu_AC_refine; + } + } else + entropy->pub.encode_mcu = encode_mcu; + + /* Allocate & initialize requested statistics areas */ + for (ci = 0; ci < cinfo->comps_in_scan; ci++) { + compptr = cinfo->cur_comp_info[ci]; + /* DC needs no table for refinement scan */ + if (cinfo->progressive_mode == 0 || (cinfo->Ss == 0 && cinfo->Ah == 0)) { + tbl = compptr->dc_tbl_no; + if (tbl < 0 || tbl >= NUM_ARITH_TBLS) + ERREXIT1(cinfo, JERR_NO_ARITH_TABLE, tbl); + if (entropy->dc_stats[tbl] == NULL) + entropy->dc_stats[tbl] = (unsigned char *) (*cinfo->mem->alloc_small) + ((j_common_ptr) cinfo, JPOOL_IMAGE, DC_STAT_BINS); + MEMZERO(entropy->dc_stats[tbl], DC_STAT_BINS); + /* Initialize DC predictions to 0 */ + entropy->last_dc_val[ci] = 0; + entropy->dc_context[ci] = 0; + } + /* AC needs no table when not present */ + if (cinfo->progressive_mode == 0 || cinfo->Se) { + tbl = compptr->ac_tbl_no; + if (tbl < 0 || tbl >= NUM_ARITH_TBLS) + ERREXIT1(cinfo, JERR_NO_ARITH_TABLE, tbl); + if (entropy->ac_stats[tbl] == NULL) + entropy->ac_stats[tbl] = (unsigned char *) (*cinfo->mem->alloc_small) + ((j_common_ptr) cinfo, JPOOL_IMAGE, AC_STAT_BINS); + MEMZERO(entropy->ac_stats[tbl], AC_STAT_BINS); +#ifdef CALCULATE_SPECTRAL_CONDITIONING + if (cinfo->progressive_mode) + /* Section G.1.3.2: Set appropriate arithmetic conditioning value Kx */ + cinfo->arith_ac_K[tbl] = cinfo->Ss + ((8 + cinfo->Se - cinfo->Ss) >> 4); +#endif + } + } + + /* Initialize arithmetic encoding variables */ + entropy->c = 0; + entropy->a = 0x10000L; + entropy->sc = 0; + entropy->zc = 0; + entropy->ct = 11; + entropy->buffer = -1; /* empty */ + + /* Initialize restart stuff */ + entropy->restarts_to_go = cinfo->restart_interval; + entropy->next_restart_num = 0; +} + + +/* + * Module initialization routine for arithmetic entropy encoding. + */ + +GLOBAL(void) +jinit_arith_encoder (j_compress_ptr cinfo) +{ + arith_entropy_ptr entropy; + int i; + + entropy = (arith_entropy_ptr) + (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE, + sizeof(arith_entropy_encoder)); + cinfo->entropy = (struct jpeg_entropy_encoder *) entropy; + entropy->pub.start_pass = start_pass; + entropy->pub.finish_pass = finish_pass; + + /* Mark tables unallocated */ + for (i = 0; i < NUM_ARITH_TBLS; i++) { + entropy->dc_stats[i] = NULL; + entropy->ac_stats[i] = NULL; + } + + /* Initialize index for fixed probability estimation */ + entropy->fixed_bin[0] = 113; +} diff --git a/media/libjpeg/jccoefct.c b/media/libjpeg/jccoefct.c new file mode 100644 index 000000000..a08d6e323 --- /dev/null +++ b/media/libjpeg/jccoefct.c @@ -0,0 +1,449 @@ +/* + * jccoefct.c + * + * This file was part of the Independent JPEG Group's software: + * Copyright (C) 1994-1997, Thomas G. Lane. + * It was modified by The libjpeg-turbo Project to include only code and + * information relevant to libjpeg-turbo. + * For conditions of distribution and use, see the accompanying README.ijg + * file. + * + * This file contains the coefficient buffer controller for compression. + * This controller is the top level of the JPEG compressor proper. + * The coefficient buffer lies between forward-DCT and entropy encoding steps. + */ + +#define JPEG_INTERNALS +#include "jinclude.h" +#include "jpeglib.h" + + +/* We use a full-image coefficient buffer when doing Huffman optimization, + * and also for writing multiple-scan JPEG files. In all cases, the DCT + * step is run during the first pass, and subsequent passes need only read + * the buffered coefficients. + */ +#ifdef ENTROPY_OPT_SUPPORTED +#define FULL_COEF_BUFFER_SUPPORTED +#else +#ifdef C_MULTISCAN_FILES_SUPPORTED +#define FULL_COEF_BUFFER_SUPPORTED +#endif +#endif + + +/* Private buffer controller object */ + +typedef struct { + struct jpeg_c_coef_controller pub; /* public fields */ + + JDIMENSION iMCU_row_num; /* iMCU row # within image */ + JDIMENSION mcu_ctr; /* counts MCUs processed in current row */ + int MCU_vert_offset; /* counts MCU rows within iMCU row */ + int MCU_rows_per_iMCU_row; /* number of such rows needed */ + + /* For single-pass compression, it's sufficient to buffer just one MCU + * (although this may prove a bit slow in practice). We allocate a + * workspace of C_MAX_BLOCKS_IN_MCU coefficient blocks, and reuse it for each + * MCU constructed and sent. In multi-pass modes, this array points to the + * current MCU's blocks within the virtual arrays. + */ + JBLOCKROW MCU_buffer[C_MAX_BLOCKS_IN_MCU]; + + /* In multi-pass modes, we need a virtual block array for each component. */ + jvirt_barray_ptr whole_image[MAX_COMPONENTS]; +} my_coef_controller; + +typedef my_coef_controller *my_coef_ptr; + + +/* Forward declarations */ +METHODDEF(boolean) compress_data + (j_compress_ptr cinfo, JSAMPIMAGE input_buf); +#ifdef FULL_COEF_BUFFER_SUPPORTED +METHODDEF(boolean) compress_first_pass + (j_compress_ptr cinfo, JSAMPIMAGE input_buf); +METHODDEF(boolean) compress_output + (j_compress_ptr cinfo, JSAMPIMAGE input_buf); +#endif + + +LOCAL(void) +start_iMCU_row (j_compress_ptr cinfo) +/* Reset within-iMCU-row counters for a new row */ +{ + my_coef_ptr coef = (my_coef_ptr) cinfo->coef; + + /* In an interleaved scan, an MCU row is the same as an iMCU row. + * In a noninterleaved scan, an iMCU row has v_samp_factor MCU rows. + * But at the bottom of the image, process only what's left. + */ + if (cinfo->comps_in_scan > 1) { + coef->MCU_rows_per_iMCU_row = 1; + } else { + if (coef->iMCU_row_num < (cinfo->total_iMCU_rows-1)) + coef->MCU_rows_per_iMCU_row = cinfo->cur_comp_info[0]->v_samp_factor; + else + coef->MCU_rows_per_iMCU_row = cinfo->cur_comp_info[0]->last_row_height; + } + + coef->mcu_ctr = 0; + coef->MCU_vert_offset = 0; +} + + +/* + * Initialize for a processing pass. + */ + +METHODDEF(void) +start_pass_coef (j_compress_ptr cinfo, J_BUF_MODE pass_mode) +{ + my_coef_ptr coef = (my_coef_ptr) cinfo->coef; + + coef->iMCU_row_num = 0; + start_iMCU_row(cinfo); + + switch (pass_mode) { + case JBUF_PASS_THRU: + if (coef->whole_image[0] != NULL) + ERREXIT(cinfo, JERR_BAD_BUFFER_MODE); + coef->pub.compress_data = compress_data; + break; +#ifdef FULL_COEF_BUFFER_SUPPORTED + case JBUF_SAVE_AND_PASS: + if (coef->whole_image[0] == NULL) + ERREXIT(cinfo, JERR_BAD_BUFFER_MODE); + coef->pub.compress_data = compress_first_pass; + break; + case JBUF_CRANK_DEST: + if (coef->whole_image[0] == NULL) + ERREXIT(cinfo, JERR_BAD_BUFFER_MODE); + coef->pub.compress_data = compress_output; + break; +#endif + default: + ERREXIT(cinfo, JERR_BAD_BUFFER_MODE); + break; + } +} + + +/* + * Process some data in the single-pass case. + * We process the equivalent of one fully interleaved MCU row ("iMCU" row) + * per call, ie, v_samp_factor block rows for each component in the image. + * Returns TRUE if the iMCU row is completed, FALSE if suspended. + * + * NB: input_buf contains a plane for each component in image, + * which we index according to the component's SOF position. + */ + +METHODDEF(boolean) +compress_data (j_compress_ptr cinfo, JSAMPIMAGE input_buf) +{ + my_coef_ptr coef = (my_coef_ptr) cinfo->coef; + JDIMENSION MCU_col_num; /* index of current MCU within row */ + JDIMENSION last_MCU_col = cinfo->MCUs_per_row - 1; + JDIMENSION last_iMCU_row = cinfo->total_iMCU_rows - 1; + int blkn, bi, ci, yindex, yoffset, blockcnt; + JDIMENSION ypos, xpos; + jpeg_component_info *compptr; + + /* Loop to write as much as one whole iMCU row */ + for (yoffset = coef->MCU_vert_offset; yoffset < coef->MCU_rows_per_iMCU_row; + yoffset++) { + for (MCU_col_num = coef->mcu_ctr; MCU_col_num <= last_MCU_col; + MCU_col_num++) { + /* Determine where data comes from in input_buf and do the DCT thing. + * Each call on forward_DCT processes a horizontal row of DCT blocks + * as wide as an MCU; we rely on having allocated the MCU_buffer[] blocks + * sequentially. Dummy blocks at the right or bottom edge are filled in + * specially. The data in them does not matter for image reconstruction, + * so we fill them with values that will encode to the smallest amount of + * data, viz: all zeroes in the AC entries, DC entries equal to previous + * block's DC value. (Thanks to Thomas Kinsman for this idea.) + */ + blkn = 0; + for (ci = 0; ci < cinfo->comps_in_scan; ci++) { + compptr = cinfo->cur_comp_info[ci]; + blockcnt = (MCU_col_num < last_MCU_col) ? compptr->MCU_width + : compptr->last_col_width; + xpos = MCU_col_num * compptr->MCU_sample_width; + ypos = yoffset * DCTSIZE; /* ypos == (yoffset+yindex) * DCTSIZE */ + for (yindex = 0; yindex < compptr->MCU_height; yindex++) { + if (coef->iMCU_row_num < last_iMCU_row || + yoffset+yindex < compptr->last_row_height) { + (*cinfo->fdct->forward_DCT) (cinfo, compptr, + input_buf[compptr->component_index], + coef->MCU_buffer[blkn], + ypos, xpos, (JDIMENSION) blockcnt); + if (blockcnt < compptr->MCU_width) { + /* Create some dummy blocks at the right edge of the image. */ + jzero_far((void *) coef->MCU_buffer[blkn + blockcnt], + (compptr->MCU_width - blockcnt) * sizeof(JBLOCK)); + for (bi = blockcnt; bi < compptr->MCU_width; bi++) { + coef->MCU_buffer[blkn+bi][0][0] = coef->MCU_buffer[blkn+bi-1][0][0]; + } + } + } else { + /* Create a row of dummy blocks at the bottom of the image. */ + jzero_far((void *) coef->MCU_buffer[blkn], + compptr->MCU_width * sizeof(JBLOCK)); + for (bi = 0; bi < compptr->MCU_width; bi++) { + coef->MCU_buffer[blkn+bi][0][0] = coef->MCU_buffer[blkn-1][0][0]; + } + } + blkn += compptr->MCU_width; + ypos += DCTSIZE; + } + } + /* Try to write the MCU. In event of a suspension failure, we will + * re-DCT the MCU on restart (a bit inefficient, could be fixed...) + */ + if (! (*cinfo->entropy->encode_mcu) (cinfo, coef->MCU_buffer)) { + /* Suspension forced; update state counters and exit */ + coef->MCU_vert_offset = yoffset; + coef->mcu_ctr = MCU_col_num; + return FALSE; + } + } + /* Completed an MCU row, but perhaps not an iMCU row */ + coef->mcu_ctr = 0; + } + /* Completed the iMCU row, advance counters for next one */ + coef->iMCU_row_num++; + start_iMCU_row(cinfo); + return TRUE; +} + + +#ifdef FULL_COEF_BUFFER_SUPPORTED + +/* + * Process some data in the first pass of a multi-pass case. + * We process the equivalent of one fully interleaved MCU row ("iMCU" row) + * per call, ie, v_samp_factor block rows for each component in the image. + * This amount of data is read from the source buffer, DCT'd and quantized, + * and saved into the virtual arrays. We also generate suitable dummy blocks + * as needed at the right and lower edges. (The dummy blocks are constructed + * in the virtual arrays, which have been padded appropriately.) This makes + * it possible for subsequent passes not to worry about real vs. dummy blocks. + * + * We must also emit the data to the entropy encoder. This is conveniently + * done by calling compress_output() after we've loaded the current strip + * of the virtual arrays. + * + * NB: input_buf contains a plane for each component in image. All + * components are DCT'd and loaded into the virtual arrays in this pass. + * However, it may be that only a subset of the components are emitted to + * the entropy encoder during this first pass; be careful about looking + * at the scan-dependent variables (MCU dimensions, etc). + */ + +METHODDEF(boolean) +compress_first_pass (j_compress_ptr cinfo, JSAMPIMAGE input_buf) +{ + my_coef_ptr coef = (my_coef_ptr) cinfo->coef; + JDIMENSION last_iMCU_row = cinfo->total_iMCU_rows - 1; + JDIMENSION blocks_across, MCUs_across, MCUindex; + int bi, ci, h_samp_factor, block_row, block_rows, ndummy; + JCOEF lastDC; + jpeg_component_info *compptr; + JBLOCKARRAY buffer; + JBLOCKROW thisblockrow, lastblockrow; + + for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components; + ci++, compptr++) { + /* Align the virtual buffer for this component. */ + buffer = (*cinfo->mem->access_virt_barray) + ((j_common_ptr) cinfo, coef->whole_image[ci], + coef->iMCU_row_num * compptr->v_samp_factor, + (JDIMENSION) compptr->v_samp_factor, TRUE); + /* Count non-dummy DCT block rows in this iMCU row. */ + if (coef->iMCU_row_num < last_iMCU_row) + block_rows = compptr->v_samp_factor; + else { + /* NB: can't use last_row_height here, since may not be set! */ + block_rows = (int) (compptr->height_in_blocks % compptr->v_samp_factor); + if (block_rows == 0) block_rows = compptr->v_samp_factor; + } + blocks_across = compptr->width_in_blocks; + h_samp_factor = compptr->h_samp_factor; + /* Count number of dummy blocks to be added at the right margin. */ + ndummy = (int) (blocks_across % h_samp_factor); + if (ndummy > 0) + ndummy = h_samp_factor - ndummy; + /* Perform DCT for all non-dummy blocks in this iMCU row. Each call + * on forward_DCT processes a complete horizontal row of DCT blocks. + */ + for (block_row = 0; block_row < block_rows; block_row++) { + thisblockrow = buffer[block_row]; + (*cinfo->fdct->forward_DCT) (cinfo, compptr, + input_buf[ci], thisblockrow, + (JDIMENSION) (block_row * DCTSIZE), + (JDIMENSION) 0, blocks_across); + if (ndummy > 0) { + /* Create dummy blocks at the right edge of the image. */ + thisblockrow += blocks_across; /* => first dummy block */ + jzero_far((void *) thisblockrow, ndummy * sizeof(JBLOCK)); + lastDC = thisblockrow[-1][0]; + for (bi = 0; bi < ndummy; bi++) { + thisblockrow[bi][0] = lastDC; + } + } + } + /* If at end of image, create dummy block rows as needed. + * The tricky part here is that within each MCU, we want the DC values + * of the dummy blocks to match the last real block's DC value. + * This squeezes a few more bytes out of the resulting file... + */ + if (coef->iMCU_row_num == last_iMCU_row) { + blocks_across += ndummy; /* include lower right corner */ + MCUs_across = blocks_across / h_samp_factor; + for (block_row = block_rows; block_row < compptr->v_samp_factor; + block_row++) { + thisblockrow = buffer[block_row]; + lastblockrow = buffer[block_row-1]; + jzero_far((void *) thisblockrow, + (size_t) (blocks_across * sizeof(JBLOCK))); + for (MCUindex = 0; MCUindex < MCUs_across; MCUindex++) { + lastDC = lastblockrow[h_samp_factor-1][0]; + for (bi = 0; bi < h_samp_factor; bi++) { + thisblockrow[bi][0] = lastDC; + } + thisblockrow += h_samp_factor; /* advance to next MCU in row */ + lastblockrow += h_samp_factor; + } + } + } + } + /* NB: compress_output will increment iMCU_row_num if successful. + * A suspension return will result in redoing all the work above next time. + */ + + /* Emit data to the entropy encoder, sharing code with subsequent passes */ + return compress_output(cinfo, input_buf); +} + + +/* + * Process some data in subsequent passes of a multi-pass case. + * We process the equivalent of one fully interleaved MCU row ("iMCU" row) + * per call, ie, v_samp_factor block rows for each component in the scan. + * The data is obtained from the virtual arrays and fed to the entropy coder. + * Returns TRUE if the iMCU row is completed, FALSE if suspended. + * + * NB: input_buf is ignored; it is likely to be a NULL pointer. + */ + +METHODDEF(boolean) +compress_output (j_compress_ptr cinfo, JSAMPIMAGE input_buf) +{ + my_coef_ptr coef = (my_coef_ptr) cinfo->coef; + JDIMENSION MCU_col_num; /* index of current MCU within row */ + int blkn, ci, xindex, yindex, yoffset; + JDIMENSION start_col; + JBLOCKARRAY buffer[MAX_COMPS_IN_SCAN]; + JBLOCKROW buffer_ptr; + jpeg_component_info *compptr; + + /* Align the virtual buffers for the components used in this scan. + * NB: during first pass, this is safe only because the buffers will + * already be aligned properly, so jmemmgr.c won't need to do any I/O. + */ + for (ci = 0; ci < cinfo->comps_in_scan; ci++) { + compptr = cinfo->cur_comp_info[ci]; + buffer[ci] = (*cinfo->mem->access_virt_barray) + ((j_common_ptr) cinfo, coef->whole_image[compptr->component_index], + coef->iMCU_row_num * compptr->v_samp_factor, + (JDIMENSION) compptr->v_samp_factor, FALSE); + } + + /* Loop to process one whole iMCU row */ + for (yoffset = coef->MCU_vert_offset; yoffset < coef->MCU_rows_per_iMCU_row; + yoffset++) { + for (MCU_col_num = coef->mcu_ctr; MCU_col_num < cinfo->MCUs_per_row; + MCU_col_num++) { + /* Construct list of pointers to DCT blocks belonging to this MCU */ + blkn = 0; /* index of current DCT block within MCU */ + for (ci = 0; ci < cinfo->comps_in_scan; ci++) { + compptr = cinfo->cur_comp_info[ci]; + start_col = MCU_col_num * compptr->MCU_width; + for (yindex = 0; yindex < compptr->MCU_height; yindex++) { + buffer_ptr = buffer[ci][yindex+yoffset] + start_col; + for (xindex = 0; xindex < compptr->MCU_width; xindex++) { + coef->MCU_buffer[blkn++] = buffer_ptr++; + } + } + } + /* Try to write the MCU. */ + if (! (*cinfo->entropy->encode_mcu) (cinfo, coef->MCU_buffer)) { + /* Suspension forced; update state counters and exit */ + coef->MCU_vert_offset = yoffset; + coef->mcu_ctr = MCU_col_num; + return FALSE; + } + } + /* Completed an MCU row, but perhaps not an iMCU row */ + coef->mcu_ctr = 0; + } + /* Completed the iMCU row, advance counters for next one */ + coef->iMCU_row_num++; + start_iMCU_row(cinfo); + return TRUE; +} + +#endif /* FULL_COEF_BUFFER_SUPPORTED */ + + +/* + * Initialize coefficient buffer controller. + */ + +GLOBAL(void) +jinit_c_coef_controller (j_compress_ptr cinfo, boolean need_full_buffer) +{ + my_coef_ptr coef; + + coef = (my_coef_ptr) + (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE, + sizeof(my_coef_controller)); + cinfo->coef = (struct jpeg_c_coef_controller *) coef; + coef->pub.start_pass = start_pass_coef; + + /* Create the coefficient buffer. */ + if (need_full_buffer) { +#ifdef FULL_COEF_BUFFER_SUPPORTED + /* Allocate a full-image virtual array for each component, */ + /* padded to a multiple of samp_factor DCT blocks in each direction. */ + int ci; + jpeg_component_info *compptr; + + for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components; + ci++, compptr++) { + coef->whole_image[ci] = (*cinfo->mem->request_virt_barray) + ((j_common_ptr) cinfo, JPOOL_IMAGE, FALSE, + (JDIMENSION) jround_up((long) compptr->width_in_blocks, + (long) compptr->h_samp_factor), + (JDIMENSION) jround_up((long) compptr->height_in_blocks, + (long) compptr->v_samp_factor), + (JDIMENSION) compptr->v_samp_factor); + } +#else + ERREXIT(cinfo, JERR_BAD_BUFFER_MODE); +#endif + } else { + /* We only need a single-MCU buffer. */ + JBLOCKROW buffer; + int i; + + buffer = (JBLOCKROW) + (*cinfo->mem->alloc_large) ((j_common_ptr) cinfo, JPOOL_IMAGE, + C_MAX_BLOCKS_IN_MCU * sizeof(JBLOCK)); + for (i = 0; i < C_MAX_BLOCKS_IN_MCU; i++) { + coef->MCU_buffer[i] = buffer + i; + } + coef->whole_image[0] = NULL; /* flag for no virtual arrays */ + } +} diff --git a/media/libjpeg/jccolext.c b/media/libjpeg/jccolext.c new file mode 100644 index 000000000..479b32044 --- /dev/null +++ b/media/libjpeg/jccolext.c @@ -0,0 +1,148 @@ +/* + * jccolext.c + * + * This file was part of the Independent JPEG Group's software: + * Copyright (C) 1991-1996, Thomas G. Lane. + * libjpeg-turbo Modifications: + * Copyright (C) 2009-2012, 2015, D. R. Commander. + * For conditions of distribution and use, see the accompanying README.ijg + * file. + * + * This file contains input colorspace conversion routines. + */ + + +/* This file is included by jccolor.c */ + + +/* + * Convert some rows of samples to the JPEG colorspace. + * + * Note that we change from the application's interleaved-pixel format + * to our internal noninterleaved, one-plane-per-component format. + * The input buffer is therefore three times as wide as the output buffer. + * + * A starting row offset is provided only for the output buffer. The caller + * can easily adjust the passed input_buf value to accommodate any row + * offset required on that side. + */ + +INLINE +LOCAL(void) +rgb_ycc_convert_internal (j_compress_ptr cinfo, + JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows) +{ + my_cconvert_ptr cconvert = (my_cconvert_ptr) cinfo->cconvert; + register int r, g, b; + register JLONG * ctab = cconvert->rgb_ycc_tab; + register JSAMPROW inptr; + register JSAMPROW outptr0, outptr1, outptr2; + register JDIMENSION col; + JDIMENSION num_cols = cinfo->image_width; + + while (--num_rows >= 0) { + inptr = *input_buf++; + outptr0 = output_buf[0][output_row]; + outptr1 = output_buf[1][output_row]; + outptr2 = output_buf[2][output_row]; + output_row++; + for (col = 0; col < num_cols; col++) { + r = GETJSAMPLE(inptr[RGB_RED]); + g = GETJSAMPLE(inptr[RGB_GREEN]); + b = GETJSAMPLE(inptr[RGB_BLUE]); + inptr += RGB_PIXELSIZE; + /* If the inputs are 0..MAXJSAMPLE, the outputs of these equations + * must be too; we do not need an explicit range-limiting operation. + * Hence the value being shifted is never negative, and we don't + * need the general RIGHT_SHIFT macro. + */ + /* Y */ + outptr0[col] = (JSAMPLE) + ((ctab[r+R_Y_OFF] + ctab[g+G_Y_OFF] + ctab[b+B_Y_OFF]) + >> SCALEBITS); + /* Cb */ + outptr1[col] = (JSAMPLE) + ((ctab[r+R_CB_OFF] + ctab[g+G_CB_OFF] + ctab[b+B_CB_OFF]) + >> SCALEBITS); + /* Cr */ + outptr2[col] = (JSAMPLE) + ((ctab[r+R_CR_OFF] + ctab[g+G_CR_OFF] + ctab[b+B_CR_OFF]) + >> SCALEBITS); + } + } +} + + +/**************** Cases other than RGB -> YCbCr **************/ + + +/* + * Convert some rows of samples to the JPEG colorspace. + * This version handles RGB->grayscale conversion, which is the same + * as the RGB->Y portion of RGB->YCbCr. + * We assume rgb_ycc_start has been called (we only use the Y tables). + */ + +INLINE +LOCAL(void) +rgb_gray_convert_internal (j_compress_ptr cinfo, + JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows) +{ + my_cconvert_ptr cconvert = (my_cconvert_ptr) cinfo->cconvert; + register int r, g, b; + register JLONG * ctab = cconvert->rgb_ycc_tab; + register JSAMPROW inptr; + register JSAMPROW outptr; + register JDIMENSION col; + JDIMENSION num_cols = cinfo->image_width; + + while (--num_rows >= 0) { + inptr = *input_buf++; + outptr = output_buf[0][output_row]; + output_row++; + for (col = 0; col < num_cols; col++) { + r = GETJSAMPLE(inptr[RGB_RED]); + g = GETJSAMPLE(inptr[RGB_GREEN]); + b = GETJSAMPLE(inptr[RGB_BLUE]); + inptr += RGB_PIXELSIZE; + /* Y */ + outptr[col] = (JSAMPLE) + ((ctab[r+R_Y_OFF] + ctab[g+G_Y_OFF] + ctab[b+B_Y_OFF]) + >> SCALEBITS); + } + } +} + + +/* + * Convert some rows of samples to the JPEG colorspace. + * This version handles extended RGB->plain RGB conversion + */ + +INLINE +LOCAL(void) +rgb_rgb_convert_internal (j_compress_ptr cinfo, + JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows) +{ + register JSAMPROW inptr; + register JSAMPROW outptr0, outptr1, outptr2; + register JDIMENSION col; + JDIMENSION num_cols = cinfo->image_width; + + while (--num_rows >= 0) { + inptr = *input_buf++; + outptr0 = output_buf[0][output_row]; + outptr1 = output_buf[1][output_row]; + outptr2 = output_buf[2][output_row]; + output_row++; + for (col = 0; col < num_cols; col++) { + outptr0[col] = GETJSAMPLE(inptr[RGB_RED]); + outptr1[col] = GETJSAMPLE(inptr[RGB_GREEN]); + outptr2[col] = GETJSAMPLE(inptr[RGB_BLUE]); + inptr += RGB_PIXELSIZE; + } + } +} diff --git a/media/libjpeg/jccolor.c b/media/libjpeg/jccolor.c new file mode 100644 index 000000000..b973d101d --- /dev/null +++ b/media/libjpeg/jccolor.c @@ -0,0 +1,719 @@ +/* + * jccolor.c + * + * This file was part of the Independent JPEG Group's software: + * Copyright (C) 1991-1996, Thomas G. Lane. + * libjpeg-turbo Modifications: + * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB + * Copyright (C) 2009-2012, 2015, D. R. Commander. + * Copyright (C) 2014, MIPS Technologies, Inc., California. + * For conditions of distribution and use, see the accompanying README.ijg + * file. + * + * This file contains input colorspace conversion routines. + */ + +#define JPEG_INTERNALS +#include "jinclude.h" +#include "jpeglib.h" +#include "jsimd.h" +#include "jconfigint.h" + + +/* Private subobject */ + +typedef struct { + struct jpeg_color_converter pub; /* public fields */ + + /* Private state for RGB->YCC conversion */ + JLONG *rgb_ycc_tab; /* => table for RGB to YCbCr conversion */ +} my_color_converter; + +typedef my_color_converter *my_cconvert_ptr; + + +/**************** RGB -> YCbCr conversion: most common case **************/ + +/* + * YCbCr is defined per CCIR 601-1, except that Cb and Cr are + * normalized to the range 0..MAXJSAMPLE rather than -0.5 .. 0.5. + * The conversion equations to be implemented are therefore + * Y = 0.29900 * R + 0.58700 * G + 0.11400 * B + * Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE + * Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE + * (These numbers are derived from TIFF 6.0 section 21, dated 3-June-92.) + * Note: older versions of the IJG code used a zero offset of MAXJSAMPLE/2, + * rather than CENTERJSAMPLE, for Cb and Cr. This gave equal positive and + * negative swings for Cb/Cr, but meant that grayscale values (Cb=Cr=0) + * were not represented exactly. Now we sacrifice exact representation of + * maximum red and maximum blue in order to get exact grayscales. + * + * To avoid floating-point arithmetic, we represent the fractional constants + * as integers scaled up by 2^16 (about 4 digits precision); we have to divide + * the products by 2^16, with appropriate rounding, to get the correct answer. + * + * For even more speed, we avoid doing any multiplications in the inner loop + * by precalculating the constants times R,G,B for all possible values. + * For 8-bit JSAMPLEs this is very reasonable (only 256 entries per table); + * for 12-bit samples it is still acceptable. It's not very reasonable for + * 16-bit samples, but if you want lossless storage you shouldn't be changing + * colorspace anyway. + * The CENTERJSAMPLE offsets and the rounding fudge-factor of 0.5 are included + * in the tables to save adding them separately in the inner loop. + */ + +#define SCALEBITS 16 /* speediest right-shift on some machines */ +#define CBCR_OFFSET ((JLONG) CENTERJSAMPLE << SCALEBITS) +#define ONE_HALF ((JLONG) 1 << (SCALEBITS-1)) +#define FIX(x) ((JLONG) ((x) * (1L<<SCALEBITS) + 0.5)) + +/* We allocate one big table and divide it up into eight parts, instead of + * doing eight alloc_small requests. This lets us use a single table base + * address, which can be held in a register in the inner loops on many + * machines (more than can hold all eight addresses, anyway). + */ + +#define R_Y_OFF 0 /* offset to R => Y section */ +#define G_Y_OFF (1*(MAXJSAMPLE+1)) /* offset to G => Y section */ +#define B_Y_OFF (2*(MAXJSAMPLE+1)) /* etc. */ +#define R_CB_OFF (3*(MAXJSAMPLE+1)) +#define G_CB_OFF (4*(MAXJSAMPLE+1)) +#define B_CB_OFF (5*(MAXJSAMPLE+1)) +#define R_CR_OFF B_CB_OFF /* B=>Cb, R=>Cr are the same */ +#define G_CR_OFF (6*(MAXJSAMPLE+1)) +#define B_CR_OFF (7*(MAXJSAMPLE+1)) +#define TABLE_SIZE (8*(MAXJSAMPLE+1)) + + +/* Include inline routines for colorspace extensions */ + +#include "jccolext.c" +#undef RGB_RED +#undef RGB_GREEN +#undef RGB_BLUE +#undef RGB_PIXELSIZE + +#define RGB_RED EXT_RGB_RED +#define RGB_GREEN EXT_RGB_GREEN +#define RGB_BLUE EXT_RGB_BLUE +#define RGB_PIXELSIZE EXT_RGB_PIXELSIZE +#define rgb_ycc_convert_internal extrgb_ycc_convert_internal +#define rgb_gray_convert_internal extrgb_gray_convert_internal +#define rgb_rgb_convert_internal extrgb_rgb_convert_internal +#include "jccolext.c" +#undef RGB_RED +#undef RGB_GREEN +#undef RGB_BLUE +#undef RGB_PIXELSIZE +#undef rgb_ycc_convert_internal +#undef rgb_gray_convert_internal +#undef rgb_rgb_convert_internal + +#define RGB_RED EXT_RGBX_RED +#define RGB_GREEN EXT_RGBX_GREEN +#define RGB_BLUE EXT_RGBX_BLUE +#define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE +#define rgb_ycc_convert_internal extrgbx_ycc_convert_internal +#define rgb_gray_convert_internal extrgbx_gray_convert_internal +#define rgb_rgb_convert_internal extrgbx_rgb_convert_internal +#include "jccolext.c" +#undef RGB_RED +#undef RGB_GREEN +#undef RGB_BLUE +#undef RGB_PIXELSIZE +#undef rgb_ycc_convert_internal +#undef rgb_gray_convert_internal +#undef rgb_rgb_convert_internal + +#define RGB_RED EXT_BGR_RED +#define RGB_GREEN EXT_BGR_GREEN +#define RGB_BLUE EXT_BGR_BLUE +#define RGB_PIXELSIZE EXT_BGR_PIXELSIZE +#define rgb_ycc_convert_internal extbgr_ycc_convert_internal +#define rgb_gray_convert_internal extbgr_gray_convert_internal +#define rgb_rgb_convert_internal extbgr_rgb_convert_internal +#include "jccolext.c" +#undef RGB_RED +#undef RGB_GREEN +#undef RGB_BLUE +#undef RGB_PIXELSIZE +#undef rgb_ycc_convert_internal +#undef rgb_gray_convert_internal +#undef rgb_rgb_convert_internal + +#define RGB_RED EXT_BGRX_RED +#define RGB_GREEN EXT_BGRX_GREEN +#define RGB_BLUE EXT_BGRX_BLUE +#define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE +#define rgb_ycc_convert_internal extbgrx_ycc_convert_internal +#define rgb_gray_convert_internal extbgrx_gray_convert_internal +#define rgb_rgb_convert_internal extbgrx_rgb_convert_internal +#include "jccolext.c" +#undef RGB_RED +#undef RGB_GREEN +#undef RGB_BLUE +#undef RGB_PIXELSIZE +#undef rgb_ycc_convert_internal +#undef rgb_gray_convert_internal +#undef rgb_rgb_convert_internal + +#define RGB_RED EXT_XBGR_RED +#define RGB_GREEN EXT_XBGR_GREEN +#define RGB_BLUE EXT_XBGR_BLUE +#define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE +#define rgb_ycc_convert_internal extxbgr_ycc_convert_internal +#define rgb_gray_convert_internal extxbgr_gray_convert_internal +#define rgb_rgb_convert_internal extxbgr_rgb_convert_internal +#include "jccolext.c" +#undef RGB_RED +#undef RGB_GREEN +#undef RGB_BLUE +#undef RGB_PIXELSIZE +#undef rgb_ycc_convert_internal +#undef rgb_gray_convert_internal +#undef rgb_rgb_convert_internal + +#define RGB_RED EXT_XRGB_RED +#define RGB_GREEN EXT_XRGB_GREEN +#define RGB_BLUE EXT_XRGB_BLUE +#define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE +#define rgb_ycc_convert_internal extxrgb_ycc_convert_internal +#define rgb_gray_convert_internal extxrgb_gray_convert_internal +#define rgb_rgb_convert_internal extxrgb_rgb_convert_internal +#include "jccolext.c" +#undef RGB_RED +#undef RGB_GREEN +#undef RGB_BLUE +#undef RGB_PIXELSIZE +#undef rgb_ycc_convert_internal +#undef rgb_gray_convert_internal +#undef rgb_rgb_convert_internal + + +/* + * Initialize for RGB->YCC colorspace conversion. + */ + +METHODDEF(void) +rgb_ycc_start (j_compress_ptr cinfo) +{ + my_cconvert_ptr cconvert = (my_cconvert_ptr) cinfo->cconvert; + JLONG *rgb_ycc_tab; + JLONG i; + + /* Allocate and fill in the conversion tables. */ + cconvert->rgb_ycc_tab = rgb_ycc_tab = (JLONG *) + (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE, + (TABLE_SIZE * sizeof(JLONG))); + + for (i = 0; i <= MAXJSAMPLE; i++) { + rgb_ycc_tab[i+R_Y_OFF] = FIX(0.29900) * i; + rgb_ycc_tab[i+G_Y_OFF] = FIX(0.58700) * i; + rgb_ycc_tab[i+B_Y_OFF] = FIX(0.11400) * i + ONE_HALF; + rgb_ycc_tab[i+R_CB_OFF] = (-FIX(0.16874)) * i; + rgb_ycc_tab[i+G_CB_OFF] = (-FIX(0.33126)) * i; + /* We use a rounding fudge-factor of 0.5-epsilon for Cb and Cr. + * This ensures that the maximum output will round to MAXJSAMPLE + * not MAXJSAMPLE+1, and thus that we don't have to range-limit. + */ + rgb_ycc_tab[i+B_CB_OFF] = FIX(0.50000) * i + CBCR_OFFSET + ONE_HALF-1; +/* B=>Cb and R=>Cr tables are the same + rgb_ycc_tab[i+R_CR_OFF] = FIX(0.50000) * i + CBCR_OFFSET + ONE_HALF-1; +*/ + rgb_ycc_tab[i+G_CR_OFF] = (-FIX(0.41869)) * i; + rgb_ycc_tab[i+B_CR_OFF] = (-FIX(0.08131)) * i; + } +} + + +/* + * Convert some rows of samples to the JPEG colorspace. + */ + +METHODDEF(void) +rgb_ycc_convert (j_compress_ptr cinfo, + JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows) +{ + switch (cinfo->in_color_space) { + case JCS_EXT_RGB: + extrgb_ycc_convert_internal(cinfo, input_buf, output_buf, output_row, + num_rows); + break; + case JCS_EXT_RGBX: + case JCS_EXT_RGBA: + extrgbx_ycc_convert_internal(cinfo, input_buf, output_buf, output_row, + num_rows); + break; + case JCS_EXT_BGR: + extbgr_ycc_convert_internal(cinfo, input_buf, output_buf, output_row, + num_rows); + break; + case JCS_EXT_BGRX: + case JCS_EXT_BGRA: + extbgrx_ycc_convert_internal(cinfo, input_buf, output_buf, output_row, + num_rows); + break; + case JCS_EXT_XBGR: + case JCS_EXT_ABGR: + extxbgr_ycc_convert_internal(cinfo, input_buf, output_buf, output_row, + num_rows); + break; + case JCS_EXT_XRGB: + case JCS_EXT_ARGB: + extxrgb_ycc_convert_internal(cinfo, input_buf, output_buf, output_row, + num_rows); + break; + default: + rgb_ycc_convert_internal(cinfo, input_buf, output_buf, output_row, + num_rows); + break; + } +} + + +/**************** Cases other than RGB -> YCbCr **************/ + + +/* + * Convert some rows of samples to the JPEG colorspace. + */ + +METHODDEF(void) +rgb_gray_convert (j_compress_ptr cinfo, + JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows) +{ + switch (cinfo->in_color_space) { + case JCS_EXT_RGB: + extrgb_gray_convert_internal(cinfo, input_buf, output_buf, output_row, + num_rows); + break; + case JCS_EXT_RGBX: + case JCS_EXT_RGBA: + extrgbx_gray_convert_internal(cinfo, input_buf, output_buf, output_row, + num_rows); + break; + case JCS_EXT_BGR: + extbgr_gray_convert_internal(cinfo, input_buf, output_buf, output_row, + num_rows); + break; + case JCS_EXT_BGRX: + case JCS_EXT_BGRA: + extbgrx_gray_convert_internal(cinfo, input_buf, output_buf, output_row, + num_rows); + break; + case JCS_EXT_XBGR: + case JCS_EXT_ABGR: + extxbgr_gray_convert_internal(cinfo, input_buf, output_buf, output_row, + num_rows); + break; + case JCS_EXT_XRGB: + case JCS_EXT_ARGB: + extxrgb_gray_convert_internal(cinfo, input_buf, output_buf, output_row, + num_rows); + break; + default: + rgb_gray_convert_internal(cinfo, input_buf, output_buf, output_row, + num_rows); + break; + } +} + + +/* + * Extended RGB to plain RGB conversion + */ + +METHODDEF(void) +rgb_rgb_convert (j_compress_ptr cinfo, + JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows) +{ + switch (cinfo->in_color_space) { + case JCS_EXT_RGB: + extrgb_rgb_convert_internal(cinfo, input_buf, output_buf, output_row, + num_rows); + break; + case JCS_EXT_RGBX: + case JCS_EXT_RGBA: + extrgbx_rgb_convert_internal(cinfo, input_buf, output_buf, output_row, + num_rows); + break; + case JCS_EXT_BGR: + extbgr_rgb_convert_internal(cinfo, input_buf, output_buf, output_row, + num_rows); + break; + case JCS_EXT_BGRX: + case JCS_EXT_BGRA: + extbgrx_rgb_convert_internal(cinfo, input_buf, output_buf, output_row, + num_rows); + break; + case JCS_EXT_XBGR: + case JCS_EXT_ABGR: + extxbgr_rgb_convert_internal(cinfo, input_buf, output_buf, output_row, + num_rows); + break; + case JCS_EXT_XRGB: + case JCS_EXT_ARGB: + extxrgb_rgb_convert_internal(cinfo, input_buf, output_buf, output_row, + num_rows); + break; + default: + rgb_rgb_convert_internal(cinfo, input_buf, output_buf, output_row, + num_rows); + break; + } +} + + +/* + * Convert some rows of samples to the JPEG colorspace. + * This version handles Adobe-style CMYK->YCCK conversion, + * where we convert R=1-C, G=1-M, and B=1-Y to YCbCr using the same + * conversion as above, while passing K (black) unchanged. + * We assume rgb_ycc_start has been called. + */ + +METHODDEF(void) +cmyk_ycck_convert (j_compress_ptr cinfo, + JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows) +{ + my_cconvert_ptr cconvert = (my_cconvert_ptr) cinfo->cconvert; + register int r, g, b; + register JLONG *ctab = cconvert->rgb_ycc_tab; + register JSAMPROW inptr; + register JSAMPROW outptr0, outptr1, outptr2, outptr3; + register JDIMENSION col; + JDIMENSION num_cols = cinfo->image_width; + + while (--num_rows >= 0) { + inptr = *input_buf++; + outptr0 = output_buf[0][output_row]; + outptr1 = output_buf[1][output_row]; + outptr2 = output_buf[2][output_row]; + outptr3 = output_buf[3][output_row]; + output_row++; + for (col = 0; col < num_cols; col++) { + r = MAXJSAMPLE - GETJSAMPLE(inptr[0]); + g = MAXJSAMPLE - GETJSAMPLE(inptr[1]); + b = MAXJSAMPLE - GETJSAMPLE(inptr[2]); + /* K passes through as-is */ + outptr3[col] = inptr[3]; /* don't need GETJSAMPLE here */ + inptr += 4; + /* If the inputs are 0..MAXJSAMPLE, the outputs of these equations + * must be too; we do not need an explicit range-limiting operation. + * Hence the value being shifted is never negative, and we don't + * need the general RIGHT_SHIFT macro. + */ + /* Y */ + outptr0[col] = (JSAMPLE) + ((ctab[r+R_Y_OFF] + ctab[g+G_Y_OFF] + ctab[b+B_Y_OFF]) + >> SCALEBITS); + /* Cb */ + outptr1[col] = (JSAMPLE) + ((ctab[r+R_CB_OFF] + ctab[g+G_CB_OFF] + ctab[b+B_CB_OFF]) + >> SCALEBITS); + /* Cr */ + outptr2[col] = (JSAMPLE) + ((ctab[r+R_CR_OFF] + ctab[g+G_CR_OFF] + ctab[b+B_CR_OFF]) + >> SCALEBITS); + } + } +} + + +/* + * Convert some rows of samples to the JPEG colorspace. + * This version handles grayscale output with no conversion. + * The source can be either plain grayscale or YCbCr (since Y == gray). + */ + +METHODDEF(void) +grayscale_convert (j_compress_ptr cinfo, + JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows) +{ + register JSAMPROW inptr; + register JSAMPROW outptr; + register JDIMENSION col; + JDIMENSION num_cols = cinfo->image_width; + int instride = cinfo->input_components; + + while (--num_rows >= 0) { + inptr = *input_buf++; + outptr = output_buf[0][output_row]; + output_row++; + for (col = 0; col < num_cols; col++) { + outptr[col] = inptr[0]; /* don't need GETJSAMPLE() here */ + inptr += instride; + } + } +} + + +/* + * Convert some rows of samples to the JPEG colorspace. + * This version handles multi-component colorspaces without conversion. + * We assume input_components == num_components. + */ + +METHODDEF(void) +null_convert (j_compress_ptr cinfo, + JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows) +{ + register JSAMPROW inptr; + register JSAMPROW outptr, outptr0, outptr1, outptr2, outptr3; + register JDIMENSION col; + register int ci; + int nc = cinfo->num_components; + JDIMENSION num_cols = cinfo->image_width; + + if (nc == 3) { + while (--num_rows >= 0) { + inptr = *input_buf++; + outptr0 = output_buf[0][output_row]; + outptr1 = output_buf[1][output_row]; + outptr2 = output_buf[2][output_row]; + output_row++; + for (col = 0; col < num_cols; col++) { + outptr0[col] = *inptr++; + outptr1[col] = *inptr++; + outptr2[col] = *inptr++; + } + } + } else if (nc == 4) { + while (--num_rows >= 0) { + inptr = *input_buf++; + outptr0 = output_buf[0][output_row]; + outptr1 = output_buf[1][output_row]; + outptr2 = output_buf[2][output_row]; + outptr3 = output_buf[3][output_row]; + output_row++; + for (col = 0; col < num_cols; col++) { + outptr0[col] = *inptr++; + outptr1[col] = *inptr++; + outptr2[col] = *inptr++; + outptr3[col] = *inptr++; + } + } + } else { + while (--num_rows >= 0) { + /* It seems fastest to make a separate pass for each component. */ + for (ci = 0; ci < nc; ci++) { + inptr = *input_buf; + outptr = output_buf[ci][output_row]; + for (col = 0; col < num_cols; col++) { + outptr[col] = inptr[ci]; /* don't need GETJSAMPLE() here */ + inptr += nc; + } + } + input_buf++; + output_row++; + } + } +} + + +/* + * Empty method for start_pass. + */ + +METHODDEF(void) +null_method (j_compress_ptr cinfo) +{ + /* no work needed */ +} + + +/* + * Module initialization routine for input colorspace conversion. + */ + +GLOBAL(void) +jinit_color_converter (j_compress_ptr cinfo) +{ + my_cconvert_ptr cconvert; + + cconvert = (my_cconvert_ptr) + (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE, + sizeof(my_color_converter)); + cinfo->cconvert = (struct jpeg_color_converter *) cconvert; + /* set start_pass to null method until we find out differently */ + cconvert->pub.start_pass = null_method; + + /* Make sure input_components agrees with in_color_space */ + switch (cinfo->in_color_space) { + case JCS_GRAYSCALE: + if (cinfo->input_components != 1) + ERREXIT(cinfo, JERR_BAD_IN_COLORSPACE); + break; + + case JCS_RGB: + case JCS_EXT_RGB: + case JCS_EXT_RGBX: + case JCS_EXT_BGR: + case JCS_EXT_BGRX: + case JCS_EXT_XBGR: + case JCS_EXT_XRGB: + case JCS_EXT_RGBA: + case JCS_EXT_BGRA: + case JCS_EXT_ABGR: + case JCS_EXT_ARGB: + if (cinfo->input_components != rgb_pixelsize[cinfo->in_color_space]) + ERREXIT(cinfo, JERR_BAD_IN_COLORSPACE); + break; + + case JCS_YCbCr: + if (cinfo->input_components != 3) + ERREXIT(cinfo, JERR_BAD_IN_COLORSPACE); + break; + + case JCS_CMYK: + case JCS_YCCK: + if (cinfo->input_components != 4) + ERREXIT(cinfo, JERR_BAD_IN_COLORSPACE); + break; + + default: /* JCS_UNKNOWN can be anything */ + if (cinfo->input_components < 1) + ERREXIT(cinfo, JERR_BAD_IN_COLORSPACE); + break; + } + + /* Check num_components, set conversion method based on requested space */ + switch (cinfo->jpeg_color_space) { + case JCS_GRAYSCALE: + if (cinfo->num_components != 1) + ERREXIT(cinfo, JERR_BAD_J_COLORSPACE); + if (cinfo->in_color_space == JCS_GRAYSCALE) + cconvert->pub.color_convert = grayscale_convert; + else if (cinfo->in_color_space == JCS_RGB || + cinfo->in_color_space == JCS_EXT_RGB || + cinfo->in_color_space == JCS_EXT_RGBX || + cinfo->in_color_space == JCS_EXT_BGR || + cinfo->in_color_space == JCS_EXT_BGRX || + cinfo->in_color_space == JCS_EXT_XBGR || + cinfo->in_color_space == JCS_EXT_XRGB || + cinfo->in_color_space == JCS_EXT_RGBA || + cinfo->in_color_space == JCS_EXT_BGRA || + cinfo->in_color_space == JCS_EXT_ABGR || + cinfo->in_color_space == JCS_EXT_ARGB) { + if (jsimd_can_rgb_gray()) + cconvert->pub.color_convert = jsimd_rgb_gray_convert; + else { + cconvert->pub.start_pass = rgb_ycc_start; + cconvert->pub.color_convert = rgb_gray_convert; + } + } else if (cinfo->in_color_space == JCS_YCbCr) + cconvert->pub.color_convert = grayscale_convert; + else + ERREXIT(cinfo, JERR_CONVERSION_NOTIMPL); + break; + + case JCS_RGB: + if (cinfo->num_components != 3) + ERREXIT(cinfo, JERR_BAD_J_COLORSPACE); + if (rgb_red[cinfo->in_color_space] == 0 && + rgb_green[cinfo->in_color_space] == 1 && + rgb_blue[cinfo->in_color_space] == 2 && + rgb_pixelsize[cinfo->in_color_space] == 3) { +#if defined(__mips__) + if (jsimd_c_can_null_convert()) + cconvert->pub.color_convert = jsimd_c_null_convert; + else +#endif + cconvert->pub.color_convert = null_convert; + } else if (cinfo->in_color_space == JCS_RGB || + cinfo->in_color_space == JCS_EXT_RGB || + cinfo->in_color_space == JCS_EXT_RGBX || + cinfo->in_color_space == JCS_EXT_BGR || + cinfo->in_color_space == JCS_EXT_BGRX || + cinfo->in_color_space == JCS_EXT_XBGR || + cinfo->in_color_space == JCS_EXT_XRGB || + cinfo->in_color_space == JCS_EXT_RGBA || + cinfo->in_color_space == JCS_EXT_BGRA || + cinfo->in_color_space == JCS_EXT_ABGR || + cinfo->in_color_space == JCS_EXT_ARGB) + cconvert->pub.color_convert = rgb_rgb_convert; + else + ERREXIT(cinfo, JERR_CONVERSION_NOTIMPL); + break; + + case JCS_YCbCr: + if (cinfo->num_components != 3) + ERREXIT(cinfo, JERR_BAD_J_COLORSPACE); + if (cinfo->in_color_space == JCS_RGB || + cinfo->in_color_space == JCS_EXT_RGB || + cinfo->in_color_space == JCS_EXT_RGBX || + cinfo->in_color_space == JCS_EXT_BGR || + cinfo->in_color_space == JCS_EXT_BGRX || + cinfo->in_color_space == JCS_EXT_XBGR || + cinfo->in_color_space == JCS_EXT_XRGB || + cinfo->in_color_space == JCS_EXT_RGBA || + cinfo->in_color_space == JCS_EXT_BGRA || + cinfo->in_color_space == JCS_EXT_ABGR || + cinfo->in_color_space == JCS_EXT_ARGB) { + if (jsimd_can_rgb_ycc()) + cconvert->pub.color_convert = jsimd_rgb_ycc_convert; + else { + cconvert->pub.start_pass = rgb_ycc_start; + cconvert->pub.color_convert = rgb_ycc_convert; + } + } else if (cinfo->in_color_space == JCS_YCbCr) { +#if defined(__mips__) + if (jsimd_c_can_null_convert()) + cconvert->pub.color_convert = jsimd_c_null_convert; + else +#endif + cconvert->pub.color_convert = null_convert; + } else + ERREXIT(cinfo, JERR_CONVERSION_NOTIMPL); + break; + + case JCS_CMYK: + if (cinfo->num_components != 4) + ERREXIT(cinfo, JERR_BAD_J_COLORSPACE); + if (cinfo->in_color_space == JCS_CMYK) { +#if defined(__mips__) + if (jsimd_c_can_null_convert()) + cconvert->pub.color_convert = jsimd_c_null_convert; + else +#endif + cconvert->pub.color_convert = null_convert; + } else + ERREXIT(cinfo, JERR_CONVERSION_NOTIMPL); + break; + + case JCS_YCCK: + if (cinfo->num_components != 4) + ERREXIT(cinfo, JERR_BAD_J_COLORSPACE); + if (cinfo->in_color_space == JCS_CMYK) { + cconvert->pub.start_pass = rgb_ycc_start; + cconvert->pub.color_convert = cmyk_ycck_convert; + } else if (cinfo->in_color_space == JCS_YCCK) { +#if defined(__mips__) + if (jsimd_c_can_null_convert()) + cconvert->pub.color_convert = jsimd_c_null_convert; + else +#endif + cconvert->pub.color_convert = null_convert; + } else + ERREXIT(cinfo, JERR_CONVERSION_NOTIMPL); + break; + + default: /* allow null conversion of JCS_UNKNOWN */ + if (cinfo->jpeg_color_space != cinfo->in_color_space || + cinfo->num_components != cinfo->input_components) + ERREXIT(cinfo, JERR_CONVERSION_NOTIMPL); +#if defined(__mips__) + if (jsimd_c_can_null_convert()) + cconvert->pub.color_convert = jsimd_c_null_convert; + else +#endif + cconvert->pub.color_convert = null_convert; + break; + } +} diff --git a/media/libjpeg/jcdctmgr.c b/media/libjpeg/jcdctmgr.c new file mode 100644 index 000000000..aef8517f9 --- /dev/null +++ b/media/libjpeg/jcdctmgr.c @@ -0,0 +1,721 @@ +/* + * jcdctmgr.c + * + * This file was part of the Independent JPEG Group's software: + * Copyright (C) 1994-1996, Thomas G. Lane. + * libjpeg-turbo Modifications: + * Copyright (C) 1999-2006, MIYASAKA Masaru. + * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB + * Copyright (C) 2011, 2014-2015, D. R. Commander. + * For conditions of distribution and use, see the accompanying README.ijg + * file. + * + * This file contains the forward-DCT management logic. + * This code selects a particular DCT implementation to be used, + * and it performs related housekeeping chores including coefficient + * quantization. + */ + +#define JPEG_INTERNALS +#include "jinclude.h" +#include "jpeglib.h" +#include "jdct.h" /* Private declarations for DCT subsystem */ +#include "jsimddct.h" + + +/* Private subobject for this module */ + +typedef void (*forward_DCT_method_ptr) (DCTELEM *data); +typedef void (*float_DCT_method_ptr) (FAST_FLOAT *data); + +typedef void (*convsamp_method_ptr) (JSAMPARRAY sample_data, + JDIMENSION start_col, + DCTELEM *workspace); +typedef void (*float_convsamp_method_ptr) (JSAMPARRAY sample_data, + JDIMENSION start_col, + FAST_FLOAT *workspace); + +typedef void (*quantize_method_ptr) (JCOEFPTR coef_block, DCTELEM *divisors, + DCTELEM *workspace); +typedef void (*float_quantize_method_ptr) (JCOEFPTR coef_block, + FAST_FLOAT *divisors, + FAST_FLOAT *workspace); + +METHODDEF(void) quantize (JCOEFPTR, DCTELEM *, DCTELEM *); + +typedef struct { + struct jpeg_forward_dct pub; /* public fields */ + + /* Pointer to the DCT routine actually in use */ + forward_DCT_method_ptr dct; + convsamp_method_ptr convsamp; + quantize_method_ptr quantize; + + /* The actual post-DCT divisors --- not identical to the quant table + * entries, because of scaling (especially for an unnormalized DCT). + * Each table is given in normal array order. + */ + DCTELEM *divisors[NUM_QUANT_TBLS]; + + /* work area for FDCT subroutine */ + DCTELEM *workspace; + +#ifdef DCT_FLOAT_SUPPORTED + /* Same as above for the floating-point case. */ + float_DCT_method_ptr float_dct; + float_convsamp_method_ptr float_convsamp; + float_quantize_method_ptr float_quantize; + FAST_FLOAT *float_divisors[NUM_QUANT_TBLS]; + FAST_FLOAT *float_workspace; +#endif +} my_fdct_controller; + +typedef my_fdct_controller *my_fdct_ptr; + + +#if BITS_IN_JSAMPLE == 8 + +/* + * Find the highest bit in an integer through binary search. + */ + +LOCAL(int) +flss (UINT16 val) +{ + int bit; + + bit = 16; + + if (!val) + return 0; + + if (!(val & 0xff00)) { + bit -= 8; + val <<= 8; + } + if (!(val & 0xf000)) { + bit -= 4; + val <<= 4; + } + if (!(val & 0xc000)) { + bit -= 2; + val <<= 2; + } + if (!(val & 0x8000)) { + bit -= 1; + val <<= 1; + } + + return bit; +} + + +/* + * Compute values to do a division using reciprocal. + * + * This implementation is based on an algorithm described in + * "How to optimize for the Pentium family of microprocessors" + * (http://www.agner.org/assem/). + * More information about the basic algorithm can be found in + * the paper "Integer Division Using Reciprocals" by Robert Alverson. + * + * The basic idea is to replace x/d by x * d^-1. In order to store + * d^-1 with enough precision we shift it left a few places. It turns + * out that this algoright gives just enough precision, and also fits + * into DCTELEM: + * + * b = (the number of significant bits in divisor) - 1 + * r = (word size) + b + * f = 2^r / divisor + * + * f will not be an integer for most cases, so we need to compensate + * for the rounding error introduced: + * + * no fractional part: + * + * result = input >> r + * + * fractional part of f < 0.5: + * + * round f down to nearest integer + * result = ((input + 1) * f) >> r + * + * fractional part of f > 0.5: + * + * round f up to nearest integer + * result = (input * f) >> r + * + * This is the original algorithm that gives truncated results. But we + * want properly rounded results, so we replace "input" with + * "input + divisor/2". + * + * In order to allow SIMD implementations we also tweak the values to + * allow the same calculation to be made at all times: + * + * dctbl[0] = f rounded to nearest integer + * dctbl[1] = divisor / 2 (+ 1 if fractional part of f < 0.5) + * dctbl[2] = 1 << ((word size) * 2 - r) + * dctbl[3] = r - (word size) + * + * dctbl[2] is for stupid instruction sets where the shift operation + * isn't member wise (e.g. MMX). + * + * The reason dctbl[2] and dctbl[3] reduce the shift with (word size) + * is that most SIMD implementations have a "multiply and store top + * half" operation. + * + * Lastly, we store each of the values in their own table instead + * of in a consecutive manner, yet again in order to allow SIMD + * routines. + */ + +LOCAL(int) +compute_reciprocal (UINT16 divisor, DCTELEM *dtbl) +{ + UDCTELEM2 fq, fr; + UDCTELEM c; + int b, r; + + if (divisor == 1) { + /* divisor == 1 means unquantized, so these reciprocal/correction/shift + * values will cause the C quantization algorithm to act like the + * identity function. Since only the C quantization algorithm is used in + * these cases, the scale value is irrelevant. + */ + dtbl[DCTSIZE2 * 0] = (DCTELEM) 1; /* reciprocal */ + dtbl[DCTSIZE2 * 1] = (DCTELEM) 0; /* correction */ + dtbl[DCTSIZE2 * 2] = (DCTELEM) 1; /* scale */ + dtbl[DCTSIZE2 * 3] = -(DCTELEM) (sizeof(DCTELEM) * 8); /* shift */ + return 0; + } + + b = flss(divisor) - 1; + r = sizeof(DCTELEM) * 8 + b; + + fq = ((UDCTELEM2)1 << r) / divisor; + fr = ((UDCTELEM2)1 << r) % divisor; + + c = divisor / 2; /* for rounding */ + + if (fr == 0) { /* divisor is power of two */ + /* fq will be one bit too large to fit in DCTELEM, so adjust */ + fq >>= 1; + r--; + } else if (fr <= (divisor / 2U)) { /* fractional part is < 0.5 */ + c++; + } else { /* fractional part is > 0.5 */ + fq++; + } + + dtbl[DCTSIZE2 * 0] = (DCTELEM) fq; /* reciprocal */ + dtbl[DCTSIZE2 * 1] = (DCTELEM) c; /* correction + roundfactor */ +#ifdef WITH_SIMD + dtbl[DCTSIZE2 * 2] = (DCTELEM) (1 << (sizeof(DCTELEM)*8*2 - r)); /* scale */ +#else + dtbl[DCTSIZE2 * 2] = 1; +#endif + dtbl[DCTSIZE2 * 3] = (DCTELEM) r - sizeof(DCTELEM)*8; /* shift */ + + if(r <= 16) return 0; + else return 1; +} + +#endif + + +/* + * Initialize for a processing pass. + * Verify that all referenced Q-tables are present, and set up + * the divisor table for each one. + * In the current implementation, DCT of all components is done during + * the first pass, even if only some components will be output in the + * first scan. Hence all components should be examined here. + */ + +METHODDEF(void) +start_pass_fdctmgr (j_compress_ptr cinfo) +{ + my_fdct_ptr fdct = (my_fdct_ptr) cinfo->fdct; + int ci, qtblno, i; + jpeg_component_info *compptr; + JQUANT_TBL *qtbl; + DCTELEM *dtbl; + + for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components; + ci++, compptr++) { + qtblno = compptr->quant_tbl_no; + /* Make sure specified quantization table is present */ + if (qtblno < 0 || qtblno >= NUM_QUANT_TBLS || + cinfo->quant_tbl_ptrs[qtblno] == NULL) + ERREXIT1(cinfo, JERR_NO_QUANT_TABLE, qtblno); + qtbl = cinfo->quant_tbl_ptrs[qtblno]; + /* Compute divisors for this quant table */ + /* We may do this more than once for same table, but it's not a big deal */ + switch (cinfo->dct_method) { +#ifdef DCT_ISLOW_SUPPORTED + case JDCT_ISLOW: + /* For LL&M IDCT method, divisors are equal to raw quantization + * coefficients multiplied by 8 (to counteract scaling). + */ + if (fdct->divisors[qtblno] == NULL) { + fdct->divisors[qtblno] = (DCTELEM *) + (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE, + (DCTSIZE2 * 4) * sizeof(DCTELEM)); + } + dtbl = fdct->divisors[qtblno]; + for (i = 0; i < DCTSIZE2; i++) { +#if BITS_IN_JSAMPLE == 8 + if (!compute_reciprocal(qtbl->quantval[i] << 3, &dtbl[i]) && + fdct->quantize == jsimd_quantize) + fdct->quantize = quantize; +#else + dtbl[i] = ((DCTELEM) qtbl->quantval[i]) << 3; +#endif + } + break; +#endif +#ifdef DCT_IFAST_SUPPORTED + case JDCT_IFAST: + { + /* For AA&N IDCT method, divisors are equal to quantization + * coefficients scaled by scalefactor[row]*scalefactor[col], where + * scalefactor[0] = 1 + * scalefactor[k] = cos(k*PI/16) * sqrt(2) for k=1..7 + * We apply a further scale factor of 8. + */ +#define CONST_BITS 14 + static const INT16 aanscales[DCTSIZE2] = { + /* precomputed values scaled up by 14 bits */ + 16384, 22725, 21407, 19266, 16384, 12873, 8867, 4520, + 22725, 31521, 29692, 26722, 22725, 17855, 12299, 6270, + 21407, 29692, 27969, 25172, 21407, 16819, 11585, 5906, + 19266, 26722, 25172, 22654, 19266, 15137, 10426, 5315, + 16384, 22725, 21407, 19266, 16384, 12873, 8867, 4520, + 12873, 17855, 16819, 15137, 12873, 10114, 6967, 3552, + 8867, 12299, 11585, 10426, 8867, 6967, 4799, 2446, + 4520, 6270, 5906, 5315, 4520, 3552, 2446, 1247 + }; + SHIFT_TEMPS + + if (fdct->divisors[qtblno] == NULL) { + fdct->divisors[qtblno] = (DCTELEM *) + (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE, + (DCTSIZE2 * 4) * sizeof(DCTELEM)); + } + dtbl = fdct->divisors[qtblno]; + for (i = 0; i < DCTSIZE2; i++) { +#if BITS_IN_JSAMPLE == 8 + if (!compute_reciprocal( + DESCALE(MULTIPLY16V16((JLONG) qtbl->quantval[i], + (JLONG) aanscales[i]), + CONST_BITS-3), &dtbl[i]) && + fdct->quantize == jsimd_quantize) + fdct->quantize = quantize; +#else + dtbl[i] = (DCTELEM) + DESCALE(MULTIPLY16V16((JLONG) qtbl->quantval[i], + (JLONG) aanscales[i]), + CONST_BITS-3); +#endif + } + } + break; +#endif +#ifdef DCT_FLOAT_SUPPORTED + case JDCT_FLOAT: + { + /* For float AA&N IDCT method, divisors are equal to quantization + * coefficients scaled by scalefactor[row]*scalefactor[col], where + * scalefactor[0] = 1 + * scalefactor[k] = cos(k*PI/16) * sqrt(2) for k=1..7 + * We apply a further scale factor of 8. + * What's actually stored is 1/divisor so that the inner loop can + * use a multiplication rather than a division. + */ + FAST_FLOAT *fdtbl; + int row, col; + static const double aanscalefactor[DCTSIZE] = { + 1.0, 1.387039845, 1.306562965, 1.175875602, + 1.0, 0.785694958, 0.541196100, 0.275899379 + }; + + if (fdct->float_divisors[qtblno] == NULL) { + fdct->float_divisors[qtblno] = (FAST_FLOAT *) + (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE, + DCTSIZE2 * sizeof(FAST_FLOAT)); + } + fdtbl = fdct->float_divisors[qtblno]; + i = 0; + for (row = 0; row < DCTSIZE; row++) { + for (col = 0; col < DCTSIZE; col++) { + fdtbl[i] = (FAST_FLOAT) + (1.0 / (((double) qtbl->quantval[i] * + aanscalefactor[row] * aanscalefactor[col] * 8.0))); + i++; + } + } + } + break; +#endif + default: + ERREXIT(cinfo, JERR_NOT_COMPILED); + break; + } + } +} + + +/* + * Load data into workspace, applying unsigned->signed conversion. + */ + +METHODDEF(void) +convsamp (JSAMPARRAY sample_data, JDIMENSION start_col, DCTELEM *workspace) +{ + register DCTELEM *workspaceptr; + register JSAMPROW elemptr; + register int elemr; + + workspaceptr = workspace; + for (elemr = 0; elemr < DCTSIZE; elemr++) { + elemptr = sample_data[elemr] + start_col; + +#if DCTSIZE == 8 /* unroll the inner loop */ + *workspaceptr++ = GETJSAMPLE(*elemptr++) - CENTERJSAMPLE; + *workspaceptr++ = GETJSAMPLE(*elemptr++) - CENTERJSAMPLE; + *workspaceptr++ = GETJSAMPLE(*elemptr++) - CENTERJSAMPLE; + *workspaceptr++ = GETJSAMPLE(*elemptr++) - CENTERJSAMPLE; + *workspaceptr++ = GETJSAMPLE(*elemptr++) - CENTERJSAMPLE; + *workspaceptr++ = GETJSAMPLE(*elemptr++) - CENTERJSAMPLE; + *workspaceptr++ = GETJSAMPLE(*elemptr++) - CENTERJSAMPLE; + *workspaceptr++ = GETJSAMPLE(*elemptr++) - CENTERJSAMPLE; +#else + { + register int elemc; + for (elemc = DCTSIZE; elemc > 0; elemc--) + *workspaceptr++ = GETJSAMPLE(*elemptr++) - CENTERJSAMPLE; + } +#endif + } +} + + +/* + * Quantize/descale the coefficients, and store into coef_blocks[]. + */ + +METHODDEF(void) +quantize (JCOEFPTR coef_block, DCTELEM *divisors, DCTELEM *workspace) +{ + int i; + DCTELEM temp; + JCOEFPTR output_ptr = coef_block; + +#if BITS_IN_JSAMPLE == 8 + + UDCTELEM recip, corr; + int shift; + UDCTELEM2 product; + + for (i = 0; i < DCTSIZE2; i++) { + temp = workspace[i]; + recip = divisors[i + DCTSIZE2 * 0]; + corr = divisors[i + DCTSIZE2 * 1]; + shift = divisors[i + DCTSIZE2 * 3]; + + if (temp < 0) { + temp = -temp; + product = (UDCTELEM2)(temp + corr) * recip; + product >>= shift + sizeof(DCTELEM)*8; + temp = (DCTELEM)product; + temp = -temp; + } else { + product = (UDCTELEM2)(temp + corr) * recip; + product >>= shift + sizeof(DCTELEM)*8; + temp = (DCTELEM)product; + } + output_ptr[i] = (JCOEF) temp; + } + +#else + + register DCTELEM qval; + + for (i = 0; i < DCTSIZE2; i++) { + qval = divisors[i]; + temp = workspace[i]; + /* Divide the coefficient value by qval, ensuring proper rounding. + * Since C does not specify the direction of rounding for negative + * quotients, we have to force the dividend positive for portability. + * + * In most files, at least half of the output values will be zero + * (at default quantization settings, more like three-quarters...) + * so we should ensure that this case is fast. On many machines, + * a comparison is enough cheaper than a divide to make a special test + * a win. Since both inputs will be nonnegative, we need only test + * for a < b to discover whether a/b is 0. + * If your machine's division is fast enough, define FAST_DIVIDE. + */ +#ifdef FAST_DIVIDE +#define DIVIDE_BY(a,b) a /= b +#else +#define DIVIDE_BY(a,b) if (a >= b) a /= b; else a = 0 +#endif + if (temp < 0) { + temp = -temp; + temp += qval>>1; /* for rounding */ + DIVIDE_BY(temp, qval); + temp = -temp; + } else { + temp += qval>>1; /* for rounding */ + DIVIDE_BY(temp, qval); + } + output_ptr[i] = (JCOEF) temp; + } + +#endif + +} + + +/* + * Perform forward DCT on one or more blocks of a component. + * + * The input samples are taken from the sample_data[] array starting at + * position start_row/start_col, and moving to the right for any additional + * blocks. The quantized coefficients are returned in coef_blocks[]. + */ + +METHODDEF(void) +forward_DCT (j_compress_ptr cinfo, jpeg_component_info *compptr, + JSAMPARRAY sample_data, JBLOCKROW coef_blocks, + JDIMENSION start_row, JDIMENSION start_col, + JDIMENSION num_blocks) +/* This version is used for integer DCT implementations. */ +{ + /* This routine is heavily used, so it's worth coding it tightly. */ + my_fdct_ptr fdct = (my_fdct_ptr) cinfo->fdct; + DCTELEM *divisors = fdct->divisors[compptr->quant_tbl_no]; + DCTELEM *workspace; + JDIMENSION bi; + + /* Make sure the compiler doesn't look up these every pass */ + forward_DCT_method_ptr do_dct = fdct->dct; + convsamp_method_ptr do_convsamp = fdct->convsamp; + quantize_method_ptr do_quantize = fdct->quantize; + workspace = fdct->workspace; + + sample_data += start_row; /* fold in the vertical offset once */ + + for (bi = 0; bi < num_blocks; bi++, start_col += DCTSIZE) { + /* Load data into workspace, applying unsigned->signed conversion */ + (*do_convsamp) (sample_data, start_col, workspace); + + /* Perform the DCT */ + (*do_dct) (workspace); + + /* Quantize/descale the coefficients, and store into coef_blocks[] */ + (*do_quantize) (coef_blocks[bi], divisors, workspace); + } +} + + +#ifdef DCT_FLOAT_SUPPORTED + + +METHODDEF(void) +convsamp_float (JSAMPARRAY sample_data, JDIMENSION start_col, FAST_FLOAT *workspace) +{ + register FAST_FLOAT *workspaceptr; + register JSAMPROW elemptr; + register int elemr; + + workspaceptr = workspace; + for (elemr = 0; elemr < DCTSIZE; elemr++) { + elemptr = sample_data[elemr] + start_col; +#if DCTSIZE == 8 /* unroll the inner loop */ + *workspaceptr++ = (FAST_FLOAT)(GETJSAMPLE(*elemptr++) - CENTERJSAMPLE); + *workspaceptr++ = (FAST_FLOAT)(GETJSAMPLE(*elemptr++) - CENTERJSAMPLE); + *workspaceptr++ = (FAST_FLOAT)(GETJSAMPLE(*elemptr++) - CENTERJSAMPLE); + *workspaceptr++ = (FAST_FLOAT)(GETJSAMPLE(*elemptr++) - CENTERJSAMPLE); + *workspaceptr++ = (FAST_FLOAT)(GETJSAMPLE(*elemptr++) - CENTERJSAMPLE); + *workspaceptr++ = (FAST_FLOAT)(GETJSAMPLE(*elemptr++) - CENTERJSAMPLE); + *workspaceptr++ = (FAST_FLOAT)(GETJSAMPLE(*elemptr++) - CENTERJSAMPLE); + *workspaceptr++ = (FAST_FLOAT)(GETJSAMPLE(*elemptr++) - CENTERJSAMPLE); +#else + { + register int elemc; + for (elemc = DCTSIZE; elemc > 0; elemc--) + *workspaceptr++ = (FAST_FLOAT) + (GETJSAMPLE(*elemptr++) - CENTERJSAMPLE); + } +#endif + } +} + + +METHODDEF(void) +quantize_float (JCOEFPTR coef_block, FAST_FLOAT *divisors, FAST_FLOAT *workspace) +{ + register FAST_FLOAT temp; + register int i; + register JCOEFPTR output_ptr = coef_block; + + for (i = 0; i < DCTSIZE2; i++) { + /* Apply the quantization and scaling factor */ + temp = workspace[i] * divisors[i]; + + /* Round to nearest integer. + * Since C does not specify the direction of rounding for negative + * quotients, we have to force the dividend positive for portability. + * The maximum coefficient size is +-16K (for 12-bit data), so this + * code should work for either 16-bit or 32-bit ints. + */ + output_ptr[i] = (JCOEF) ((int) (temp + (FAST_FLOAT) 16384.5) - 16384); + } +} + + +METHODDEF(void) +forward_DCT_float (j_compress_ptr cinfo, jpeg_component_info *compptr, + JSAMPARRAY sample_data, JBLOCKROW coef_blocks, + JDIMENSION start_row, JDIMENSION start_col, + JDIMENSION num_blocks) +/* This version is used for floating-point DCT implementations. */ +{ + /* This routine is heavily used, so it's worth coding it tightly. */ + my_fdct_ptr fdct = (my_fdct_ptr) cinfo->fdct; + FAST_FLOAT *divisors = fdct->float_divisors[compptr->quant_tbl_no]; + FAST_FLOAT *workspace; + JDIMENSION bi; + + + /* Make sure the compiler doesn't look up these every pass */ + float_DCT_method_ptr do_dct = fdct->float_dct; + float_convsamp_method_ptr do_convsamp = fdct->float_convsamp; + float_quantize_method_ptr do_quantize = fdct->float_quantize; + workspace = fdct->float_workspace; + + sample_data += start_row; /* fold in the vertical offset once */ + + for (bi = 0; bi < num_blocks; bi++, start_col += DCTSIZE) { + /* Load data into workspace, applying unsigned->signed conversion */ + (*do_convsamp) (sample_data, start_col, workspace); + + /* Perform the DCT */ + (*do_dct) (workspace); + + /* Quantize/descale the coefficients, and store into coef_blocks[] */ + (*do_quantize) (coef_blocks[bi], divisors, workspace); + } +} + +#endif /* DCT_FLOAT_SUPPORTED */ + + +/* + * Initialize FDCT manager. + */ + +GLOBAL(void) +jinit_forward_dct (j_compress_ptr cinfo) +{ + my_fdct_ptr fdct; + int i; + + fdct = (my_fdct_ptr) + (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE, + sizeof(my_fdct_controller)); + cinfo->fdct = (struct jpeg_forward_dct *) fdct; + fdct->pub.start_pass = start_pass_fdctmgr; + + /* First determine the DCT... */ + switch (cinfo->dct_method) { +#ifdef DCT_ISLOW_SUPPORTED + case JDCT_ISLOW: + fdct->pub.forward_DCT = forward_DCT; + if (jsimd_can_fdct_islow()) + fdct->dct = jsimd_fdct_islow; + else + fdct->dct = jpeg_fdct_islow; + break; +#endif +#ifdef DCT_IFAST_SUPPORTED + case JDCT_IFAST: + fdct->pub.forward_DCT = forward_DCT; + if (jsimd_can_fdct_ifast()) + fdct->dct = jsimd_fdct_ifast; + else + fdct->dct = jpeg_fdct_ifast; + break; +#endif +#ifdef DCT_FLOAT_SUPPORTED + case JDCT_FLOAT: + fdct->pub.forward_DCT = forward_DCT_float; + if (jsimd_can_fdct_float()) + fdct->float_dct = jsimd_fdct_float; + else + fdct->float_dct = jpeg_fdct_float; + break; +#endif + default: + ERREXIT(cinfo, JERR_NOT_COMPILED); + break; + } + + /* ...then the supporting stages. */ + switch (cinfo->dct_method) { +#ifdef DCT_ISLOW_SUPPORTED + case JDCT_ISLOW: +#endif +#ifdef DCT_IFAST_SUPPORTED + case JDCT_IFAST: +#endif +#if defined(DCT_ISLOW_SUPPORTED) || defined(DCT_IFAST_SUPPORTED) + if (jsimd_can_convsamp()) + fdct->convsamp = jsimd_convsamp; + else + fdct->convsamp = convsamp; + if (jsimd_can_quantize()) + fdct->quantize = jsimd_quantize; + else + fdct->quantize = quantize; + break; +#endif +#ifdef DCT_FLOAT_SUPPORTED + case JDCT_FLOAT: + if (jsimd_can_convsamp_float()) + fdct->float_convsamp = jsimd_convsamp_float; + else + fdct->float_convsamp = convsamp_float; + if (jsimd_can_quantize_float()) + fdct->float_quantize = jsimd_quantize_float; + else + fdct->float_quantize = quantize_float; + break; +#endif + default: + ERREXIT(cinfo, JERR_NOT_COMPILED); + break; + } + + /* Allocate workspace memory */ +#ifdef DCT_FLOAT_SUPPORTED + if (cinfo->dct_method == JDCT_FLOAT) + fdct->float_workspace = (FAST_FLOAT *) + (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE, + sizeof(FAST_FLOAT) * DCTSIZE2); + else +#endif + fdct->workspace = (DCTELEM *) + (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE, + sizeof(DCTELEM) * DCTSIZE2); + + /* Mark divisor tables unallocated */ + for (i = 0; i < NUM_QUANT_TBLS; i++) { + fdct->divisors[i] = NULL; +#ifdef DCT_FLOAT_SUPPORTED + fdct->float_divisors[i] = NULL; +#endif + } +} diff --git a/media/libjpeg/jchuff.c b/media/libjpeg/jchuff.c new file mode 100644 index 000000000..fffaacebc --- /dev/null +++ b/media/libjpeg/jchuff.c @@ -0,0 +1,1091 @@ +/* + * jchuff.c + * + * This file was part of the Independent JPEG Group's software: + * Copyright (C) 1991-1997, Thomas G. Lane. + * libjpeg-turbo Modifications: + * Copyright (C) 2009-2011, 2014-2016, D. R. Commander. + * Copyright (C) 2015, Matthieu Darbois. + * For conditions of distribution and use, see the accompanying README.ijg + * file. + * + * This file contains Huffman entropy encoding routines. + * + * Much of the complexity here has to do with supporting output suspension. + * If the data destination module demands suspension, we want to be able to + * back up to the start of the current MCU. To do this, we copy state + * variables into local working storage, and update them back to the + * permanent JPEG objects only upon successful completion of an MCU. + */ + +#define JPEG_INTERNALS +#include "jinclude.h" +#include "jpeglib.h" +#include "jsimd.h" +#include "jconfigint.h" +#include <limits.h> + +/* + * NOTE: If USE_CLZ_INTRINSIC is defined, then clz/bsr instructions will be + * used for bit counting rather than the lookup table. This will reduce the + * memory footprint by 64k, which is important for some mobile applications + * that create many isolated instances of libjpeg-turbo (web browsers, for + * instance.) This may improve performance on some mobile platforms as well. + * This feature is enabled by default only on ARM processors, because some x86 + * chips have a slow implementation of bsr, and the use of clz/bsr cannot be + * shown to have a significant performance impact even on the x86 chips that + * have a fast implementation of it. When building for ARMv6, you can + * explicitly disable the use of clz/bsr by adding -mthumb to the compiler + * flags (this defines __thumb__). + */ + +/* NOTE: Both GCC and Clang define __GNUC__ */ +#if defined __GNUC__ && (defined __arm__ || defined __aarch64__) +#if !defined __thumb__ || defined __thumb2__ +#define USE_CLZ_INTRINSIC +#endif +#endif + +#ifdef USE_CLZ_INTRINSIC +#define JPEG_NBITS_NONZERO(x) (32 - __builtin_clz(x)) +#define JPEG_NBITS(x) (x ? JPEG_NBITS_NONZERO(x) : 0) +#else +#include "jpeg_nbits_table.h" +#define JPEG_NBITS(x) (jpeg_nbits_table[x]) +#define JPEG_NBITS_NONZERO(x) JPEG_NBITS(x) +#endif + +#ifndef min + #define min(a,b) ((a)<(b)?(a):(b)) +#endif + + +/* Expanded entropy encoder object for Huffman encoding. + * + * The savable_state subrecord contains fields that change within an MCU, + * but must not be updated permanently until we complete the MCU. + */ + +typedef struct { + size_t put_buffer; /* current bit-accumulation buffer */ + int put_bits; /* # of bits now in it */ + int last_dc_val[MAX_COMPS_IN_SCAN]; /* last DC coef for each component */ +} savable_state; + +/* This macro is to work around compilers with missing or broken + * structure assignment. You'll need to fix this code if you have + * such a compiler and you change MAX_COMPS_IN_SCAN. + */ + +#ifndef NO_STRUCT_ASSIGN +#define ASSIGN_STATE(dest,src) ((dest) = (src)) +#else +#if MAX_COMPS_IN_SCAN == 4 +#define ASSIGN_STATE(dest,src) \ + ((dest).put_buffer = (src).put_buffer, \ + (dest).put_bits = (src).put_bits, \ + (dest).last_dc_val[0] = (src).last_dc_val[0], \ + (dest).last_dc_val[1] = (src).last_dc_val[1], \ + (dest).last_dc_val[2] = (src).last_dc_val[2], \ + (dest).last_dc_val[3] = (src).last_dc_val[3]) +#endif +#endif + + +typedef struct { + struct jpeg_entropy_encoder pub; /* public fields */ + + savable_state saved; /* Bit buffer & DC state at start of MCU */ + + /* These fields are NOT loaded into local working state. */ + unsigned int restarts_to_go; /* MCUs left in this restart interval */ + int next_restart_num; /* next restart number to write (0-7) */ + + /* Pointers to derived tables (these workspaces have image lifespan) */ + c_derived_tbl *dc_derived_tbls[NUM_HUFF_TBLS]; + c_derived_tbl *ac_derived_tbls[NUM_HUFF_TBLS]; + +#ifdef ENTROPY_OPT_SUPPORTED /* Statistics tables for optimization */ + long *dc_count_ptrs[NUM_HUFF_TBLS]; + long *ac_count_ptrs[NUM_HUFF_TBLS]; +#endif + + int simd; +} huff_entropy_encoder; + +typedef huff_entropy_encoder *huff_entropy_ptr; + +/* Working state while writing an MCU. + * This struct contains all the fields that are needed by subroutines. + */ + +typedef struct { + JOCTET *next_output_byte; /* => next byte to write in buffer */ + size_t free_in_buffer; /* # of byte spaces remaining in buffer */ + savable_state cur; /* Current bit buffer & DC state */ + j_compress_ptr cinfo; /* dump_buffer needs access to this */ +} working_state; + + +/* Forward declarations */ +METHODDEF(boolean) encode_mcu_huff (j_compress_ptr cinfo, JBLOCKROW *MCU_data); +METHODDEF(void) finish_pass_huff (j_compress_ptr cinfo); +#ifdef ENTROPY_OPT_SUPPORTED +METHODDEF(boolean) encode_mcu_gather (j_compress_ptr cinfo, + JBLOCKROW *MCU_data); +METHODDEF(void) finish_pass_gather (j_compress_ptr cinfo); +#endif + + +/* + * Initialize for a Huffman-compressed scan. + * If gather_statistics is TRUE, we do not output anything during the scan, + * just count the Huffman symbols used and generate Huffman code tables. + */ + +METHODDEF(void) +start_pass_huff (j_compress_ptr cinfo, boolean gather_statistics) +{ + huff_entropy_ptr entropy = (huff_entropy_ptr) cinfo->entropy; + int ci, dctbl, actbl; + jpeg_component_info *compptr; + + if (gather_statistics) { +#ifdef ENTROPY_OPT_SUPPORTED + entropy->pub.encode_mcu = encode_mcu_gather; + entropy->pub.finish_pass = finish_pass_gather; +#else + ERREXIT(cinfo, JERR_NOT_COMPILED); +#endif + } else { + entropy->pub.encode_mcu = encode_mcu_huff; + entropy->pub.finish_pass = finish_pass_huff; + } + + entropy->simd = jsimd_can_huff_encode_one_block(); + + for (ci = 0; ci < cinfo->comps_in_scan; ci++) { + compptr = cinfo->cur_comp_info[ci]; + dctbl = compptr->dc_tbl_no; + actbl = compptr->ac_tbl_no; + if (gather_statistics) { +#ifdef ENTROPY_OPT_SUPPORTED + /* Check for invalid table indexes */ + /* (make_c_derived_tbl does this in the other path) */ + if (dctbl < 0 || dctbl >= NUM_HUFF_TBLS) + ERREXIT1(cinfo, JERR_NO_HUFF_TABLE, dctbl); + if (actbl < 0 || actbl >= NUM_HUFF_TBLS) + ERREXIT1(cinfo, JERR_NO_HUFF_TABLE, actbl); + /* Allocate and zero the statistics tables */ + /* Note that jpeg_gen_optimal_table expects 257 entries in each table! */ + if (entropy->dc_count_ptrs[dctbl] == NULL) + entropy->dc_count_ptrs[dctbl] = (long *) + (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE, + 257 * sizeof(long)); + MEMZERO(entropy->dc_count_ptrs[dctbl], 257 * sizeof(long)); + if (entropy->ac_count_ptrs[actbl] == NULL) + entropy->ac_count_ptrs[actbl] = (long *) + (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE, + 257 * sizeof(long)); + MEMZERO(entropy->ac_count_ptrs[actbl], 257 * sizeof(long)); +#endif + } else { + /* Compute derived values for Huffman tables */ + /* We may do this more than once for a table, but it's not expensive */ + jpeg_make_c_derived_tbl(cinfo, TRUE, dctbl, + & entropy->dc_derived_tbls[dctbl]); + jpeg_make_c_derived_tbl(cinfo, FALSE, actbl, + & entropy->ac_derived_tbls[actbl]); + } + /* Initialize DC predictions to 0 */ + entropy->saved.last_dc_val[ci] = 0; + } + + /* Initialize bit buffer to empty */ + entropy->saved.put_buffer = 0; + entropy->saved.put_bits = 0; + + /* Initialize restart stuff */ + entropy->restarts_to_go = cinfo->restart_interval; + entropy->next_restart_num = 0; +} + + +/* + * Compute the derived values for a Huffman table. + * This routine also performs some validation checks on the table. + * + * Note this is also used by jcphuff.c. + */ + +GLOBAL(void) +jpeg_make_c_derived_tbl (j_compress_ptr cinfo, boolean isDC, int tblno, + c_derived_tbl **pdtbl) +{ + JHUFF_TBL *htbl; + c_derived_tbl *dtbl; + int p, i, l, lastp, si, maxsymbol; + char huffsize[257]; + unsigned int huffcode[257]; + unsigned int code; + + /* Note that huffsize[] and huffcode[] are filled in code-length order, + * paralleling the order of the symbols themselves in htbl->huffval[]. + */ + + /* Find the input Huffman table */ + if (tblno < 0 || tblno >= NUM_HUFF_TBLS) + ERREXIT1(cinfo, JERR_NO_HUFF_TABLE, tblno); + htbl = + isDC ? cinfo->dc_huff_tbl_ptrs[tblno] : cinfo->ac_huff_tbl_ptrs[tblno]; + if (htbl == NULL) + ERREXIT1(cinfo, JERR_NO_HUFF_TABLE, tblno); + + /* Allocate a workspace if we haven't already done so. */ + if (*pdtbl == NULL) + *pdtbl = (c_derived_tbl *) + (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE, + sizeof(c_derived_tbl)); + dtbl = *pdtbl; + + /* Figure C.1: make table of Huffman code length for each symbol */ + + p = 0; + for (l = 1; l <= 16; l++) { + i = (int) htbl->bits[l]; + if (i < 0 || p + i > 256) /* protect against table overrun */ + ERREXIT(cinfo, JERR_BAD_HUFF_TABLE); + while (i--) + huffsize[p++] = (char) l; + } + huffsize[p] = 0; + lastp = p; + + /* Figure C.2: generate the codes themselves */ + /* We also validate that the counts represent a legal Huffman code tree. */ + + code = 0; + si = huffsize[0]; + p = 0; + while (huffsize[p]) { + while (((int) huffsize[p]) == si) { + huffcode[p++] = code; + code++; + } + /* code is now 1 more than the last code used for codelength si; but + * it must still fit in si bits, since no code is allowed to be all ones. + */ + if (((JLONG) code) >= (((JLONG) 1) << si)) + ERREXIT(cinfo, JERR_BAD_HUFF_TABLE); + code <<= 1; + si++; + } + + /* Figure C.3: generate encoding tables */ + /* These are code and size indexed by symbol value */ + + /* Set all codeless symbols to have code length 0; + * this lets us detect duplicate VAL entries here, and later + * allows emit_bits to detect any attempt to emit such symbols. + */ + MEMZERO(dtbl->ehufsi, sizeof(dtbl->ehufsi)); + + /* This is also a convenient place to check for out-of-range + * and duplicated VAL entries. We allow 0..255 for AC symbols + * but only 0..15 for DC. (We could constrain them further + * based on data depth and mode, but this seems enough.) + */ + maxsymbol = isDC ? 15 : 255; + + for (p = 0; p < lastp; p++) { + i = htbl->huffval[p]; + if (i < 0 || i > maxsymbol || dtbl->ehufsi[i]) + ERREXIT(cinfo, JERR_BAD_HUFF_TABLE); + dtbl->ehufco[i] = huffcode[p]; + dtbl->ehufsi[i] = huffsize[p]; + } +} + + +/* Outputting bytes to the file */ + +/* Emit a byte, taking 'action' if must suspend. */ +#define emit_byte(state,val,action) \ + { *(state)->next_output_byte++ = (JOCTET) (val); \ + if (--(state)->free_in_buffer == 0) \ + if (! dump_buffer(state)) \ + { action; } } + + +LOCAL(boolean) +dump_buffer (working_state *state) +/* Empty the output buffer; return TRUE if successful, FALSE if must suspend */ +{ + struct jpeg_destination_mgr *dest = state->cinfo->dest; + + if (! (*dest->empty_output_buffer) (state->cinfo)) + return FALSE; + /* After a successful buffer dump, must reset buffer pointers */ + state->next_output_byte = dest->next_output_byte; + state->free_in_buffer = dest->free_in_buffer; + return TRUE; +} + + +/* Outputting bits to the file */ + +/* These macros perform the same task as the emit_bits() function in the + * original libjpeg code. In addition to reducing overhead by explicitly + * inlining the code, additional performance is achieved by taking into + * account the size of the bit buffer and waiting until it is almost full + * before emptying it. This mostly benefits 64-bit platforms, since 6 + * bytes can be stored in a 64-bit bit buffer before it has to be emptied. + */ + +#define EMIT_BYTE() { \ + JOCTET c; \ + put_bits -= 8; \ + c = (JOCTET)GETJOCTET(put_buffer >> put_bits); \ + *buffer++ = c; \ + if (c == 0xFF) /* need to stuff a zero byte? */ \ + *buffer++ = 0; \ + } + +#define PUT_BITS(code, size) { \ + put_bits += size; \ + put_buffer = (put_buffer << size) | code; \ +} + +#define CHECKBUF15() { \ + if (put_bits > 15) { \ + EMIT_BYTE() \ + EMIT_BYTE() \ + } \ +} + +#define CHECKBUF31() { \ + if (put_bits > 31) { \ + EMIT_BYTE() \ + EMIT_BYTE() \ + EMIT_BYTE() \ + EMIT_BYTE() \ + } \ +} + +#define CHECKBUF47() { \ + if (put_bits > 47) { \ + EMIT_BYTE() \ + EMIT_BYTE() \ + EMIT_BYTE() \ + EMIT_BYTE() \ + EMIT_BYTE() \ + EMIT_BYTE() \ + } \ +} + +#if !defined(_WIN32) && !defined(SIZEOF_SIZE_T) +#error Cannot determine word size +#endif + +#if SIZEOF_SIZE_T==8 || defined(_WIN64) + +#define EMIT_BITS(code, size) { \ + CHECKBUF47() \ + PUT_BITS(code, size) \ +} + +#define EMIT_CODE(code, size) { \ + temp2 &= (((JLONG) 1)<<nbits) - 1; \ + CHECKBUF31() \ + PUT_BITS(code, size) \ + PUT_BITS(temp2, nbits) \ + } + +#else + +#define EMIT_BITS(code, size) { \ + PUT_BITS(code, size) \ + CHECKBUF15() \ +} + +#define EMIT_CODE(code, size) { \ + temp2 &= (((JLONG) 1)<<nbits) - 1; \ + PUT_BITS(code, size) \ + CHECKBUF15() \ + PUT_BITS(temp2, nbits) \ + CHECKBUF15() \ + } + +#endif + + +/* Although it is exceedingly rare, it is possible for a Huffman-encoded + * coefficient block to be larger than the 128-byte unencoded block. For each + * of the 64 coefficients, PUT_BITS is invoked twice, and each invocation can + * theoretically store 16 bits (for a maximum of 2048 bits or 256 bytes per + * encoded block.) If, for instance, one artificially sets the AC + * coefficients to alternating values of 32767 and -32768 (using the JPEG + * scanning order-- 1, 8, 16, etc.), then this will produce an encoded block + * larger than 200 bytes. + */ +#define BUFSIZE (DCTSIZE2 * 4) + +#define LOAD_BUFFER() { \ + if (state->free_in_buffer < BUFSIZE) { \ + localbuf = 1; \ + buffer = _buffer; \ + } \ + else buffer = state->next_output_byte; \ + } + +#define STORE_BUFFER() { \ + if (localbuf) { \ + bytes = buffer - _buffer; \ + buffer = _buffer; \ + while (bytes > 0) { \ + bytestocopy = min(bytes, state->free_in_buffer); \ + MEMCOPY(state->next_output_byte, buffer, bytestocopy); \ + state->next_output_byte += bytestocopy; \ + buffer += bytestocopy; \ + state->free_in_buffer -= bytestocopy; \ + if (state->free_in_buffer == 0) \ + if (! dump_buffer(state)) return FALSE; \ + bytes -= bytestocopy; \ + } \ + } \ + else { \ + state->free_in_buffer -= (buffer - state->next_output_byte); \ + state->next_output_byte = buffer; \ + } \ + } + + +LOCAL(boolean) +flush_bits (working_state *state) +{ + JOCTET _buffer[BUFSIZE], *buffer; + size_t put_buffer; int put_bits; + size_t bytes, bytestocopy; int localbuf = 0; + + put_buffer = state->cur.put_buffer; + put_bits = state->cur.put_bits; + LOAD_BUFFER() + + /* fill any partial byte with ones */ + PUT_BITS(0x7F, 7) + while (put_bits >= 8) EMIT_BYTE() + + state->cur.put_buffer = 0; /* and reset bit-buffer to empty */ + state->cur.put_bits = 0; + STORE_BUFFER() + + return TRUE; +} + + +/* Encode a single block's worth of coefficients */ + +LOCAL(boolean) +encode_one_block_simd (working_state *state, JCOEFPTR block, int last_dc_val, + c_derived_tbl *dctbl, c_derived_tbl *actbl) +{ + JOCTET _buffer[BUFSIZE], *buffer; + size_t bytes, bytestocopy; int localbuf = 0; + + LOAD_BUFFER() + + buffer = jsimd_huff_encode_one_block(state, buffer, block, last_dc_val, + dctbl, actbl); + + STORE_BUFFER() + + return TRUE; +} + +LOCAL(boolean) +encode_one_block (working_state *state, JCOEFPTR block, int last_dc_val, + c_derived_tbl *dctbl, c_derived_tbl *actbl) +{ + int temp, temp2, temp3; + int nbits; + int r, code, size; + JOCTET _buffer[BUFSIZE], *buffer; + size_t put_buffer; int put_bits; + int code_0xf0 = actbl->ehufco[0xf0], size_0xf0 = actbl->ehufsi[0xf0]; + size_t bytes, bytestocopy; int localbuf = 0; + + put_buffer = state->cur.put_buffer; + put_bits = state->cur.put_bits; + LOAD_BUFFER() + + /* Encode the DC coefficient difference per section F.1.2.1 */ + + temp = temp2 = block[0] - last_dc_val; + + /* This is a well-known technique for obtaining the absolute value without a + * branch. It is derived from an assembly language technique presented in + * "How to Optimize for the Pentium Processors", Copyright (c) 1996, 1997 by + * Agner Fog. + */ + temp3 = temp >> (CHAR_BIT * sizeof(int) - 1); + temp ^= temp3; + temp -= temp3; + + /* For a negative input, want temp2 = bitwise complement of abs(input) */ + /* This code assumes we are on a two's complement machine */ + temp2 += temp3; + + /* Find the number of bits needed for the magnitude of the coefficient */ + nbits = JPEG_NBITS(temp); + + /* Emit the Huffman-coded symbol for the number of bits */ + code = dctbl->ehufco[nbits]; + size = dctbl->ehufsi[nbits]; + EMIT_BITS(code, size) + + /* Mask off any extra bits in code */ + temp2 &= (((JLONG) 1)<<nbits) - 1; + + /* Emit that number of bits of the value, if positive, */ + /* or the complement of its magnitude, if negative. */ + EMIT_BITS(temp2, nbits) + + /* Encode the AC coefficients per section F.1.2.2 */ + + r = 0; /* r = run length of zeros */ + +/* Manually unroll the k loop to eliminate the counter variable. This + * improves performance greatly on systems with a limited number of + * registers (such as x86.) + */ +#define kloop(jpeg_natural_order_of_k) { \ + if ((temp = block[jpeg_natural_order_of_k]) == 0) { \ + r++; \ + } else { \ + temp2 = temp; \ + /* Branch-less absolute value, bitwise complement, etc., same as above */ \ + temp3 = temp >> (CHAR_BIT * sizeof(int) - 1); \ + temp ^= temp3; \ + temp -= temp3; \ + temp2 += temp3; \ + nbits = JPEG_NBITS_NONZERO(temp); \ + /* if run length > 15, must emit special run-length-16 codes (0xF0) */ \ + while (r > 15) { \ + EMIT_BITS(code_0xf0, size_0xf0) \ + r -= 16; \ + } \ + /* Emit Huffman symbol for run length / number of bits */ \ + temp3 = (r << 4) + nbits; \ + code = actbl->ehufco[temp3]; \ + size = actbl->ehufsi[temp3]; \ + EMIT_CODE(code, size) \ + r = 0; \ + } \ +} + + /* One iteration for each value in jpeg_natural_order[] */ + kloop(1); kloop(8); kloop(16); kloop(9); kloop(2); kloop(3); + kloop(10); kloop(17); kloop(24); kloop(32); kloop(25); kloop(18); + kloop(11); kloop(4); kloop(5); kloop(12); kloop(19); kloop(26); + kloop(33); kloop(40); kloop(48); kloop(41); kloop(34); kloop(27); + kloop(20); kloop(13); kloop(6); kloop(7); kloop(14); kloop(21); + kloop(28); kloop(35); kloop(42); kloop(49); kloop(56); kloop(57); + kloop(50); kloop(43); kloop(36); kloop(29); kloop(22); kloop(15); + kloop(23); kloop(30); kloop(37); kloop(44); kloop(51); kloop(58); + kloop(59); kloop(52); kloop(45); kloop(38); kloop(31); kloop(39); + kloop(46); kloop(53); kloop(60); kloop(61); kloop(54); kloop(47); + kloop(55); kloop(62); kloop(63); + + /* If the last coef(s) were zero, emit an end-of-block code */ + if (r > 0) { + code = actbl->ehufco[0]; + size = actbl->ehufsi[0]; + EMIT_BITS(code, size) + } + + state->cur.put_buffer = put_buffer; + state->cur.put_bits = put_bits; + STORE_BUFFER() + + return TRUE; +} + + +/* + * Emit a restart marker & resynchronize predictions. + */ + +LOCAL(boolean) +emit_restart (working_state *state, int restart_num) +{ + int ci; + + if (! flush_bits(state)) + return FALSE; + + emit_byte(state, 0xFF, return FALSE); + emit_byte(state, JPEG_RST0 + restart_num, return FALSE); + + /* Re-initialize DC predictions to 0 */ + for (ci = 0; ci < state->cinfo->comps_in_scan; ci++) + state->cur.last_dc_val[ci] = 0; + + /* The restart counter is not updated until we successfully write the MCU. */ + + return TRUE; +} + + +/* + * Encode and output one MCU's worth of Huffman-compressed coefficients. + */ + +METHODDEF(boolean) +encode_mcu_huff (j_compress_ptr cinfo, JBLOCKROW *MCU_data) +{ + huff_entropy_ptr entropy = (huff_entropy_ptr) cinfo->entropy; + working_state state; + int blkn, ci; + jpeg_component_info *compptr; + + /* Load up working state */ + state.next_output_byte = cinfo->dest->next_output_byte; + state.free_in_buffer = cinfo->dest->free_in_buffer; + ASSIGN_STATE(state.cur, entropy->saved); + state.cinfo = cinfo; + + /* Emit restart marker if needed */ + if (cinfo->restart_interval) { + if (entropy->restarts_to_go == 0) + if (! emit_restart(&state, entropy->next_restart_num)) + return FALSE; + } + + /* Encode the MCU data blocks */ + if (entropy->simd) { + for (blkn = 0; blkn < cinfo->blocks_in_MCU; blkn++) { + ci = cinfo->MCU_membership[blkn]; + compptr = cinfo->cur_comp_info[ci]; + if (! encode_one_block_simd(&state, + MCU_data[blkn][0], state.cur.last_dc_val[ci], + entropy->dc_derived_tbls[compptr->dc_tbl_no], + entropy->ac_derived_tbls[compptr->ac_tbl_no])) + return FALSE; + /* Update last_dc_val */ + state.cur.last_dc_val[ci] = MCU_data[blkn][0][0]; + } + } else { + for (blkn = 0; blkn < cinfo->blocks_in_MCU; blkn++) { + ci = cinfo->MCU_membership[blkn]; + compptr = cinfo->cur_comp_info[ci]; + if (! encode_one_block(&state, + MCU_data[blkn][0], state.cur.last_dc_val[ci], + entropy->dc_derived_tbls[compptr->dc_tbl_no], + entropy->ac_derived_tbls[compptr->ac_tbl_no])) + return FALSE; + /* Update last_dc_val */ + state.cur.last_dc_val[ci] = MCU_data[blkn][0][0]; + } + } + + /* Completed MCU, so update state */ + cinfo->dest->next_output_byte = state.next_output_byte; + cinfo->dest->free_in_buffer = state.free_in_buffer; + ASSIGN_STATE(entropy->saved, state.cur); + + /* Update restart-interval state too */ + if (cinfo->restart_interval) { + if (entropy->restarts_to_go == 0) { + entropy->restarts_to_go = cinfo->restart_interval; + entropy->next_restart_num++; + entropy->next_restart_num &= 7; + } + entropy->restarts_to_go--; + } + + return TRUE; +} + + +/* + * Finish up at the end of a Huffman-compressed scan. + */ + +METHODDEF(void) +finish_pass_huff (j_compress_ptr cinfo) +{ + huff_entropy_ptr entropy = (huff_entropy_ptr) cinfo->entropy; + working_state state; + + /* Load up working state ... flush_bits needs it */ + state.next_output_byte = cinfo->dest->next_output_byte; + state.free_in_buffer = cinfo->dest->free_in_buffer; + ASSIGN_STATE(state.cur, entropy->saved); + state.cinfo = cinfo; + + /* Flush out the last data */ + if (! flush_bits(&state)) + ERREXIT(cinfo, JERR_CANT_SUSPEND); + + /* Update state */ + cinfo->dest->next_output_byte = state.next_output_byte; + cinfo->dest->free_in_buffer = state.free_in_buffer; + ASSIGN_STATE(entropy->saved, state.cur); +} + + +/* + * Huffman coding optimization. + * + * We first scan the supplied data and count the number of uses of each symbol + * that is to be Huffman-coded. (This process MUST agree with the code above.) + * Then we build a Huffman coding tree for the observed counts. + * Symbols which are not needed at all for the particular image are not + * assigned any code, which saves space in the DHT marker as well as in + * the compressed data. + */ + +#ifdef ENTROPY_OPT_SUPPORTED + + +/* Process a single block's worth of coefficients */ + +LOCAL(void) +htest_one_block (j_compress_ptr cinfo, JCOEFPTR block, int last_dc_val, + long dc_counts[], long ac_counts[]) +{ + register int temp; + register int nbits; + register int k, r; + + /* Encode the DC coefficient difference per section F.1.2.1 */ + + temp = block[0] - last_dc_val; + if (temp < 0) + temp = -temp; + + /* Find the number of bits needed for the magnitude of the coefficient */ + nbits = 0; + while (temp) { + nbits++; + temp >>= 1; + } + /* Check for out-of-range coefficient values. + * Since we're encoding a difference, the range limit is twice as much. + */ + if (nbits > MAX_COEF_BITS+1) + ERREXIT(cinfo, JERR_BAD_DCT_COEF); + + /* Count the Huffman symbol for the number of bits */ + dc_counts[nbits]++; + + /* Encode the AC coefficients per section F.1.2.2 */ + + r = 0; /* r = run length of zeros */ + + for (k = 1; k < DCTSIZE2; k++) { + if ((temp = block[jpeg_natural_order[k]]) == 0) { + r++; + } else { + /* if run length > 15, must emit special run-length-16 codes (0xF0) */ + while (r > 15) { + ac_counts[0xF0]++; + r -= 16; + } + + /* Find the number of bits needed for the magnitude of the coefficient */ + if (temp < 0) + temp = -temp; + + /* Find the number of bits needed for the magnitude of the coefficient */ + nbits = 1; /* there must be at least one 1 bit */ + while ((temp >>= 1)) + nbits++; + /* Check for out-of-range coefficient values */ + if (nbits > MAX_COEF_BITS) + ERREXIT(cinfo, JERR_BAD_DCT_COEF); + + /* Count Huffman symbol for run length / number of bits */ + ac_counts[(r << 4) + nbits]++; + + r = 0; + } + } + + /* If the last coef(s) were zero, emit an end-of-block code */ + if (r > 0) + ac_counts[0]++; +} + + +/* + * Trial-encode one MCU's worth of Huffman-compressed coefficients. + * No data is actually output, so no suspension return is possible. + */ + +METHODDEF(boolean) +encode_mcu_gather (j_compress_ptr cinfo, JBLOCKROW *MCU_data) +{ + huff_entropy_ptr entropy = (huff_entropy_ptr) cinfo->entropy; + int blkn, ci; + jpeg_component_info *compptr; + + /* Take care of restart intervals if needed */ + if (cinfo->restart_interval) { + if (entropy->restarts_to_go == 0) { + /* Re-initialize DC predictions to 0 */ + for (ci = 0; ci < cinfo->comps_in_scan; ci++) + entropy->saved.last_dc_val[ci] = 0; + /* Update restart state */ + entropy->restarts_to_go = cinfo->restart_interval; + } + entropy->restarts_to_go--; + } + + for (blkn = 0; blkn < cinfo->blocks_in_MCU; blkn++) { + ci = cinfo->MCU_membership[blkn]; + compptr = cinfo->cur_comp_info[ci]; + htest_one_block(cinfo, MCU_data[blkn][0], entropy->saved.last_dc_val[ci], + entropy->dc_count_ptrs[compptr->dc_tbl_no], + entropy->ac_count_ptrs[compptr->ac_tbl_no]); + entropy->saved.last_dc_val[ci] = MCU_data[blkn][0][0]; + } + + return TRUE; +} + + +/* + * Generate the best Huffman code table for the given counts, fill htbl. + * Note this is also used by jcphuff.c. + * + * The JPEG standard requires that no symbol be assigned a codeword of all + * one bits (so that padding bits added at the end of a compressed segment + * can't look like a valid code). Because of the canonical ordering of + * codewords, this just means that there must be an unused slot in the + * longest codeword length category. Section K.2 of the JPEG spec suggests + * reserving such a slot by pretending that symbol 256 is a valid symbol + * with count 1. In theory that's not optimal; giving it count zero but + * including it in the symbol set anyway should give a better Huffman code. + * But the theoretically better code actually seems to come out worse in + * practice, because it produces more all-ones bytes (which incur stuffed + * zero bytes in the final file). In any case the difference is tiny. + * + * The JPEG standard requires Huffman codes to be no more than 16 bits long. + * If some symbols have a very small but nonzero probability, the Huffman tree + * must be adjusted to meet the code length restriction. We currently use + * the adjustment method suggested in JPEG section K.2. This method is *not* + * optimal; it may not choose the best possible limited-length code. But + * typically only very-low-frequency symbols will be given less-than-optimal + * lengths, so the code is almost optimal. Experimental comparisons against + * an optimal limited-length-code algorithm indicate that the difference is + * microscopic --- usually less than a hundredth of a percent of total size. + * So the extra complexity of an optimal algorithm doesn't seem worthwhile. + */ + +GLOBAL(void) +jpeg_gen_optimal_table (j_compress_ptr cinfo, JHUFF_TBL *htbl, long freq[]) +{ +#define MAX_CLEN 32 /* assumed maximum initial code length */ + UINT8 bits[MAX_CLEN+1]; /* bits[k] = # of symbols with code length k */ + int codesize[257]; /* codesize[k] = code length of symbol k */ + int others[257]; /* next symbol in current branch of tree */ + int c1, c2; + int p, i, j; + long v; + + /* This algorithm is explained in section K.2 of the JPEG standard */ + + MEMZERO(bits, sizeof(bits)); + MEMZERO(codesize, sizeof(codesize)); + for (i = 0; i < 257; i++) + others[i] = -1; /* init links to empty */ + + freq[256] = 1; /* make sure 256 has a nonzero count */ + /* Including the pseudo-symbol 256 in the Huffman procedure guarantees + * that no real symbol is given code-value of all ones, because 256 + * will be placed last in the largest codeword category. + */ + + /* Huffman's basic algorithm to assign optimal code lengths to symbols */ + + for (;;) { + /* Find the smallest nonzero frequency, set c1 = its symbol */ + /* In case of ties, take the larger symbol number */ + c1 = -1; + v = 1000000000L; + for (i = 0; i <= 256; i++) { + if (freq[i] && freq[i] <= v) { + v = freq[i]; + c1 = i; + } + } + + /* Find the next smallest nonzero frequency, set c2 = its symbol */ + /* In case of ties, take the larger symbol number */ + c2 = -1; + v = 1000000000L; + for (i = 0; i <= 256; i++) { + if (freq[i] && freq[i] <= v && i != c1) { + v = freq[i]; + c2 = i; + } + } + + /* Done if we've merged everything into one frequency */ + if (c2 < 0) + break; + + /* Else merge the two counts/trees */ + freq[c1] += freq[c2]; + freq[c2] = 0; + + /* Increment the codesize of everything in c1's tree branch */ + codesize[c1]++; + while (others[c1] >= 0) { + c1 = others[c1]; + codesize[c1]++; + } + + others[c1] = c2; /* chain c2 onto c1's tree branch */ + + /* Increment the codesize of everything in c2's tree branch */ + codesize[c2]++; + while (others[c2] >= 0) { + c2 = others[c2]; + codesize[c2]++; + } + } + + /* Now count the number of symbols of each code length */ + for (i = 0; i <= 256; i++) { + if (codesize[i]) { + /* The JPEG standard seems to think that this can't happen, */ + /* but I'm paranoid... */ + if (codesize[i] > MAX_CLEN) + ERREXIT(cinfo, JERR_HUFF_CLEN_OVERFLOW); + + bits[codesize[i]]++; + } + } + + /* JPEG doesn't allow symbols with code lengths over 16 bits, so if the pure + * Huffman procedure assigned any such lengths, we must adjust the coding. + * Here is what the JPEG spec says about how this next bit works: + * Since symbols are paired for the longest Huffman code, the symbols are + * removed from this length category two at a time. The prefix for the pair + * (which is one bit shorter) is allocated to one of the pair; then, + * skipping the BITS entry for that prefix length, a code word from the next + * shortest nonzero BITS entry is converted into a prefix for two code words + * one bit longer. + */ + + for (i = MAX_CLEN; i > 16; i--) { + while (bits[i] > 0) { + j = i - 2; /* find length of new prefix to be used */ + while (bits[j] == 0) + j--; + + bits[i] -= 2; /* remove two symbols */ + bits[i-1]++; /* one goes in this length */ + bits[j+1] += 2; /* two new symbols in this length */ + bits[j]--; /* symbol of this length is now a prefix */ + } + } + + /* Remove the count for the pseudo-symbol 256 from the largest codelength */ + while (bits[i] == 0) /* find largest codelength still in use */ + i--; + bits[i]--; + + /* Return final symbol counts (only for lengths 0..16) */ + MEMCOPY(htbl->bits, bits, sizeof(htbl->bits)); + + /* Return a list of the symbols sorted by code length */ + /* It's not real clear to me why we don't need to consider the codelength + * changes made above, but the JPEG spec seems to think this works. + */ + p = 0; + for (i = 1; i <= MAX_CLEN; i++) { + for (j = 0; j <= 255; j++) { + if (codesize[j] == i) { + htbl->huffval[p] = (UINT8) j; + p++; + } + } + } + + /* Set sent_table FALSE so updated table will be written to JPEG file. */ + htbl->sent_table = FALSE; +} + + +/* + * Finish up a statistics-gathering pass and create the new Huffman tables. + */ + +METHODDEF(void) +finish_pass_gather (j_compress_ptr cinfo) +{ + huff_entropy_ptr entropy = (huff_entropy_ptr) cinfo->entropy; + int ci, dctbl, actbl; + jpeg_component_info *compptr; + JHUFF_TBL **htblptr; + boolean did_dc[NUM_HUFF_TBLS]; + boolean did_ac[NUM_HUFF_TBLS]; + + /* It's important not to apply jpeg_gen_optimal_table more than once + * per table, because it clobbers the input frequency counts! + */ + MEMZERO(did_dc, sizeof(did_dc)); + MEMZERO(did_ac, sizeof(did_ac)); + + for (ci = 0; ci < cinfo->comps_in_scan; ci++) { + compptr = cinfo->cur_comp_info[ci]; + dctbl = compptr->dc_tbl_no; + actbl = compptr->ac_tbl_no; + if (! did_dc[dctbl]) { + htblptr = & cinfo->dc_huff_tbl_ptrs[dctbl]; + if (*htblptr == NULL) + *htblptr = jpeg_alloc_huff_table((j_common_ptr) cinfo); + jpeg_gen_optimal_table(cinfo, *htblptr, entropy->dc_count_ptrs[dctbl]); + did_dc[dctbl] = TRUE; + } + if (! did_ac[actbl]) { + htblptr = & cinfo->ac_huff_tbl_ptrs[actbl]; + if (*htblptr == NULL) + *htblptr = jpeg_alloc_huff_table((j_common_ptr) cinfo); + jpeg_gen_optimal_table(cinfo, *htblptr, entropy->ac_count_ptrs[actbl]); + did_ac[actbl] = TRUE; + } + } +} + + +#endif /* ENTROPY_OPT_SUPPORTED */ + + +/* + * Module initialization routine for Huffman entropy encoding. + */ + +GLOBAL(void) +jinit_huff_encoder (j_compress_ptr cinfo) +{ + huff_entropy_ptr entropy; + int i; + + entropy = (huff_entropy_ptr) + (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE, + sizeof(huff_entropy_encoder)); + cinfo->entropy = (struct jpeg_entropy_encoder *) entropy; + entropy->pub.start_pass = start_pass_huff; + + /* Mark tables unallocated */ + for (i = 0; i < NUM_HUFF_TBLS; i++) { + entropy->dc_derived_tbls[i] = entropy->ac_derived_tbls[i] = NULL; +#ifdef ENTROPY_OPT_SUPPORTED + entropy->dc_count_ptrs[i] = entropy->ac_count_ptrs[i] = NULL; +#endif + } +} diff --git a/media/libjpeg/jchuff.h b/media/libjpeg/jchuff.h new file mode 100644 index 000000000..4236089ad --- /dev/null +++ b/media/libjpeg/jchuff.h @@ -0,0 +1,43 @@ +/* + * jchuff.h + * + * This file was part of the Independent JPEG Group's software: + * Copyright (C) 1991-1997, Thomas G. Lane. + * It was modified by The libjpeg-turbo Project to include only code relevant + * to libjpeg-turbo. + * For conditions of distribution and use, see the accompanying README.ijg + * file. + * + * This file contains declarations for Huffman entropy encoding routines + * that are shared between the sequential encoder (jchuff.c) and the + * progressive encoder (jcphuff.c). No other modules need to see these. + */ + +/* The legal range of a DCT coefficient is + * -1024 .. +1023 for 8-bit data; + * -16384 .. +16383 for 12-bit data. + * Hence the magnitude should always fit in 10 or 14 bits respectively. + */ + +#if BITS_IN_JSAMPLE == 8 +#define MAX_COEF_BITS 10 +#else +#define MAX_COEF_BITS 14 +#endif + +/* Derived data constructed for each Huffman table */ + +typedef struct { + unsigned int ehufco[256]; /* code for each symbol */ + char ehufsi[256]; /* length of code for each symbol */ + /* If no code has been allocated for a symbol S, ehufsi[S] contains 0 */ +} c_derived_tbl; + +/* Expand a Huffman table definition into the derived format */ +EXTERN(void) jpeg_make_c_derived_tbl + (j_compress_ptr cinfo, boolean isDC, int tblno, + c_derived_tbl ** pdtbl); + +/* Generate an optimal table definition given the specified counts */ +EXTERN(void) jpeg_gen_optimal_table + (j_compress_ptr cinfo, JHUFF_TBL *htbl, long freq[]); diff --git a/media/libjpeg/jcinit.c b/media/libjpeg/jcinit.c new file mode 100644 index 000000000..463bd8c6d --- /dev/null +++ b/media/libjpeg/jcinit.c @@ -0,0 +1,77 @@ +/* + * jcinit.c + * + * Copyright (C) 1991-1997, Thomas G. Lane. + * This file is part of the Independent JPEG Group's software. + * For conditions of distribution and use, see the accompanying README.ijg + * file. + * + * This file contains initialization logic for the JPEG compressor. + * This routine is in charge of selecting the modules to be executed and + * making an initialization call to each one. + * + * Logically, this code belongs in jcmaster.c. It's split out because + * linking this routine implies linking the entire compression library. + * For a transcoding-only application, we want to be able to use jcmaster.c + * without linking in the whole library. + */ + +#define JPEG_INTERNALS +#include "jinclude.h" +#include "jpeglib.h" + + +/* + * Master selection of compression modules. + * This is done once at the start of processing an image. We determine + * which modules will be used and give them appropriate initialization calls. + */ + +GLOBAL(void) +jinit_compress_master (j_compress_ptr cinfo) +{ + /* Initialize master control (includes parameter checking/processing) */ + jinit_c_master_control(cinfo, FALSE /* full compression */); + + /* Preprocessing */ + if (! cinfo->raw_data_in) { + jinit_color_converter(cinfo); + jinit_downsampler(cinfo); + jinit_c_prep_controller(cinfo, FALSE /* never need full buffer here */); + } + /* Forward DCT */ + jinit_forward_dct(cinfo); + /* Entropy encoding: either Huffman or arithmetic coding. */ + if (cinfo->arith_code) { +#ifdef C_ARITH_CODING_SUPPORTED + jinit_arith_encoder(cinfo); +#else + ERREXIT(cinfo, JERR_ARITH_NOTIMPL); +#endif + } else { + if (cinfo->progressive_mode) { +#ifdef C_PROGRESSIVE_SUPPORTED + jinit_phuff_encoder(cinfo); +#else + ERREXIT(cinfo, JERR_NOT_COMPILED); +#endif + } else + jinit_huff_encoder(cinfo); + } + + /* Need a full-image coefficient buffer in any multi-pass mode. */ + jinit_c_coef_controller(cinfo, + (boolean) (cinfo->num_scans > 1 || cinfo->optimize_coding)); + jinit_c_main_controller(cinfo, FALSE /* never need full buffer here */); + + jinit_marker_writer(cinfo); + + /* We can now tell the memory manager to allocate virtual arrays. */ + (*cinfo->mem->realize_virt_arrays) ((j_common_ptr) cinfo); + + /* Write the datastream header (SOI) immediately. + * Frame and scan headers are postponed till later. + * This lets application insert special markers after the SOI. + */ + (*cinfo->marker->write_file_header) (cinfo); +} diff --git a/media/libjpeg/jcmainct.c b/media/libjpeg/jcmainct.c new file mode 100644 index 000000000..d01f46364 --- /dev/null +++ b/media/libjpeg/jcmainct.c @@ -0,0 +1,162 @@ +/* + * jcmainct.c + * + * This file was part of the Independent JPEG Group's software: + * Copyright (C) 1994-1996, Thomas G. Lane. + * It was modified by The libjpeg-turbo Project to include only code relevant + * to libjpeg-turbo. + * For conditions of distribution and use, see the accompanying README.ijg + * file. + * + * This file contains the main buffer controller for compression. + * The main buffer lies between the pre-processor and the JPEG + * compressor proper; it holds downsampled data in the JPEG colorspace. + */ + +#define JPEG_INTERNALS +#include "jinclude.h" +#include "jpeglib.h" + + +/* Private buffer controller object */ + +typedef struct { + struct jpeg_c_main_controller pub; /* public fields */ + + JDIMENSION cur_iMCU_row; /* number of current iMCU row */ + JDIMENSION rowgroup_ctr; /* counts row groups received in iMCU row */ + boolean suspended; /* remember if we suspended output */ + J_BUF_MODE pass_mode; /* current operating mode */ + + /* If using just a strip buffer, this points to the entire set of buffers + * (we allocate one for each component). In the full-image case, this + * points to the currently accessible strips of the virtual arrays. + */ + JSAMPARRAY buffer[MAX_COMPONENTS]; +} my_main_controller; + +typedef my_main_controller *my_main_ptr; + + +/* Forward declarations */ +METHODDEF(void) process_data_simple_main + (j_compress_ptr cinfo, JSAMPARRAY input_buf, JDIMENSION *in_row_ctr, + JDIMENSION in_rows_avail); + + +/* + * Initialize for a processing pass. + */ + +METHODDEF(void) +start_pass_main (j_compress_ptr cinfo, J_BUF_MODE pass_mode) +{ + my_main_ptr main_ptr = (my_main_ptr) cinfo->main; + + /* Do nothing in raw-data mode. */ + if (cinfo->raw_data_in) + return; + + if (pass_mode != JBUF_PASS_THRU) + ERREXIT(cinfo, JERR_BAD_BUFFER_MODE); + + main_ptr->cur_iMCU_row = 0; /* initialize counters */ + main_ptr->rowgroup_ctr = 0; + main_ptr->suspended = FALSE; + main_ptr->pass_mode = pass_mode; /* save mode for use by process_data */ + main_ptr->pub.process_data = process_data_simple_main; +} + + +/* + * Process some data. + * This routine handles the simple pass-through mode, + * where we have only a strip buffer. + */ + +METHODDEF(void) +process_data_simple_main (j_compress_ptr cinfo, + JSAMPARRAY input_buf, JDIMENSION *in_row_ctr, + JDIMENSION in_rows_avail) +{ + my_main_ptr main_ptr = (my_main_ptr) cinfo->main; + + while (main_ptr->cur_iMCU_row < cinfo->total_iMCU_rows) { + /* Read input data if we haven't filled the main buffer yet */ + if (main_ptr->rowgroup_ctr < DCTSIZE) + (*cinfo->prep->pre_process_data) (cinfo, + input_buf, in_row_ctr, in_rows_avail, + main_ptr->buffer, &main_ptr->rowgroup_ctr, + (JDIMENSION) DCTSIZE); + + /* If we don't have a full iMCU row buffered, return to application for + * more data. Note that preprocessor will always pad to fill the iMCU row + * at the bottom of the image. + */ + if (main_ptr->rowgroup_ctr != DCTSIZE) + return; + + /* Send the completed row to the compressor */ + if (! (*cinfo->coef->compress_data) (cinfo, main_ptr->buffer)) { + /* If compressor did not consume the whole row, then we must need to + * suspend processing and return to the application. In this situation + * we pretend we didn't yet consume the last input row; otherwise, if + * it happened to be the last row of the image, the application would + * think we were done. + */ + if (! main_ptr->suspended) { + (*in_row_ctr)--; + main_ptr->suspended = TRUE; + } + return; + } + /* We did finish the row. Undo our little suspension hack if a previous + * call suspended; then mark the main buffer empty. + */ + if (main_ptr->suspended) { + (*in_row_ctr)++; + main_ptr->suspended = FALSE; + } + main_ptr->rowgroup_ctr = 0; + main_ptr->cur_iMCU_row++; + } +} + + +/* + * Initialize main buffer controller. + */ + +GLOBAL(void) +jinit_c_main_controller (j_compress_ptr cinfo, boolean need_full_buffer) +{ + my_main_ptr main_ptr; + int ci; + jpeg_component_info *compptr; + + main_ptr = (my_main_ptr) + (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE, + sizeof(my_main_controller)); + cinfo->main = (struct jpeg_c_main_controller *) main_ptr; + main_ptr->pub.start_pass = start_pass_main; + + /* We don't need to create a buffer in raw-data mode. */ + if (cinfo->raw_data_in) + return; + + /* Create the buffer. It holds downsampled data, so each component + * may be of a different size. + */ + if (need_full_buffer) { + ERREXIT(cinfo, JERR_BAD_BUFFER_MODE); + } else { + /* Allocate a strip buffer for each component */ + for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components; + ci++, compptr++) { + main_ptr->buffer[ci] = (*cinfo->mem->alloc_sarray) + ((j_common_ptr) cinfo, JPOOL_IMAGE, + compptr->width_in_blocks * DCTSIZE, + (JDIMENSION) (compptr->v_samp_factor * DCTSIZE)); + } + } +} diff --git a/media/libjpeg/jcmarker.c b/media/libjpeg/jcmarker.c new file mode 100644 index 000000000..463f66592 --- /dev/null +++ b/media/libjpeg/jcmarker.c @@ -0,0 +1,665 @@ +/* + * jcmarker.c + * + * This file was part of the Independent JPEG Group's software: + * Copyright (C) 1991-1998, Thomas G. Lane. + * Modified 2003-2010 by Guido Vollbeding. + * libjpeg-turbo Modifications: + * Copyright (C) 2010, D. R. Commander. + * For conditions of distribution and use, see the accompanying README.ijg + * file. + * + * This file contains routines to write JPEG datastream markers. + */ + +#define JPEG_INTERNALS +#include "jinclude.h" +#include "jpeglib.h" +#include "jpegcomp.h" + + +typedef enum { /* JPEG marker codes */ + M_SOF0 = 0xc0, + M_SOF1 = 0xc1, + M_SOF2 = 0xc2, + M_SOF3 = 0xc3, + + M_SOF5 = 0xc5, + M_SOF6 = 0xc6, + M_SOF7 = 0xc7, + + M_JPG = 0xc8, + M_SOF9 = 0xc9, + M_SOF10 = 0xca, + M_SOF11 = 0xcb, + + M_SOF13 = 0xcd, + M_SOF14 = 0xce, + M_SOF15 = 0xcf, + + M_DHT = 0xc4, + + M_DAC = 0xcc, + + M_RST0 = 0xd0, + M_RST1 = 0xd1, + M_RST2 = 0xd2, + M_RST3 = 0xd3, + M_RST4 = 0xd4, + M_RST5 = 0xd5, + M_RST6 = 0xd6, + M_RST7 = 0xd7, + + M_SOI = 0xd8, + M_EOI = 0xd9, + M_SOS = 0xda, + M_DQT = 0xdb, + M_DNL = 0xdc, + M_DRI = 0xdd, + M_DHP = 0xde, + M_EXP = 0xdf, + + M_APP0 = 0xe0, + M_APP1 = 0xe1, + M_APP2 = 0xe2, + M_APP3 = 0xe3, + M_APP4 = 0xe4, + M_APP5 = 0xe5, + M_APP6 = 0xe6, + M_APP7 = 0xe7, + M_APP8 = 0xe8, + M_APP9 = 0xe9, + M_APP10 = 0xea, + M_APP11 = 0xeb, + M_APP12 = 0xec, + M_APP13 = 0xed, + M_APP14 = 0xee, + M_APP15 = 0xef, + + M_JPG0 = 0xf0, + M_JPG13 = 0xfd, + M_COM = 0xfe, + + M_TEM = 0x01, + + M_ERROR = 0x100 +} JPEG_MARKER; + + +/* Private state */ + +typedef struct { + struct jpeg_marker_writer pub; /* public fields */ + + unsigned int last_restart_interval; /* last DRI value emitted; 0 after SOI */ +} my_marker_writer; + +typedef my_marker_writer *my_marker_ptr; + + +/* + * Basic output routines. + * + * Note that we do not support suspension while writing a marker. + * Therefore, an application using suspension must ensure that there is + * enough buffer space for the initial markers (typ. 600-700 bytes) before + * calling jpeg_start_compress, and enough space to write the trailing EOI + * (a few bytes) before calling jpeg_finish_compress. Multipass compression + * modes are not supported at all with suspension, so those two are the only + * points where markers will be written. + */ + +LOCAL(void) +emit_byte (j_compress_ptr cinfo, int val) +/* Emit a byte */ +{ + struct jpeg_destination_mgr *dest = cinfo->dest; + + *(dest->next_output_byte)++ = (JOCTET) val; + if (--dest->free_in_buffer == 0) { + if (! (*dest->empty_output_buffer) (cinfo)) + ERREXIT(cinfo, JERR_CANT_SUSPEND); + } +} + + +LOCAL(void) +emit_marker (j_compress_ptr cinfo, JPEG_MARKER mark) +/* Emit a marker code */ +{ + emit_byte(cinfo, 0xFF); + emit_byte(cinfo, (int) mark); +} + + +LOCAL(void) +emit_2bytes (j_compress_ptr cinfo, int value) +/* Emit a 2-byte integer; these are always MSB first in JPEG files */ +{ + emit_byte(cinfo, (value >> 8) & 0xFF); + emit_byte(cinfo, value & 0xFF); +} + + +/* + * Routines to write specific marker types. + */ + +LOCAL(int) +emit_dqt (j_compress_ptr cinfo, int index) +/* Emit a DQT marker */ +/* Returns the precision used (0 = 8bits, 1 = 16bits) for baseline checking */ +{ + JQUANT_TBL *qtbl = cinfo->quant_tbl_ptrs[index]; + int prec; + int i; + + if (qtbl == NULL) + ERREXIT1(cinfo, JERR_NO_QUANT_TABLE, index); + + prec = 0; + for (i = 0; i < DCTSIZE2; i++) { + if (qtbl->quantval[i] > 255) + prec = 1; + } + + if (! qtbl->sent_table) { + emit_marker(cinfo, M_DQT); + + emit_2bytes(cinfo, prec ? DCTSIZE2*2 + 1 + 2 : DCTSIZE2 + 1 + 2); + + emit_byte(cinfo, index + (prec<<4)); + + for (i = 0; i < DCTSIZE2; i++) { + /* The table entries must be emitted in zigzag order. */ + unsigned int qval = qtbl->quantval[jpeg_natural_order[i]]; + if (prec) + emit_byte(cinfo, (int) (qval >> 8)); + emit_byte(cinfo, (int) (qval & 0xFF)); + } + + qtbl->sent_table = TRUE; + } + + return prec; +} + + +LOCAL(void) +emit_dht (j_compress_ptr cinfo, int index, boolean is_ac) +/* Emit a DHT marker */ +{ + JHUFF_TBL *htbl; + int length, i; + + if (is_ac) { + htbl = cinfo->ac_huff_tbl_ptrs[index]; + index += 0x10; /* output index has AC bit set */ + } else { + htbl = cinfo->dc_huff_tbl_ptrs[index]; + } + + if (htbl == NULL) + ERREXIT1(cinfo, JERR_NO_HUFF_TABLE, index); + + if (! htbl->sent_table) { + emit_marker(cinfo, M_DHT); + + length = 0; + for (i = 1; i <= 16; i++) + length += htbl->bits[i]; + + emit_2bytes(cinfo, length + 2 + 1 + 16); + emit_byte(cinfo, index); + + for (i = 1; i <= 16; i++) + emit_byte(cinfo, htbl->bits[i]); + + for (i = 0; i < length; i++) + emit_byte(cinfo, htbl->huffval[i]); + + htbl->sent_table = TRUE; + } +} + + +LOCAL(void) +emit_dac (j_compress_ptr cinfo) +/* Emit a DAC marker */ +/* Since the useful info is so small, we want to emit all the tables in */ +/* one DAC marker. Therefore this routine does its own scan of the table. */ +{ +#ifdef C_ARITH_CODING_SUPPORTED + char dc_in_use[NUM_ARITH_TBLS]; + char ac_in_use[NUM_ARITH_TBLS]; + int length, i; + jpeg_component_info *compptr; + + for (i = 0; i < NUM_ARITH_TBLS; i++) + dc_in_use[i] = ac_in_use[i] = 0; + + for (i = 0; i < cinfo->comps_in_scan; i++) { + compptr = cinfo->cur_comp_info[i]; + /* DC needs no table for refinement scan */ + if (cinfo->Ss == 0 && cinfo->Ah == 0) + dc_in_use[compptr->dc_tbl_no] = 1; + /* AC needs no table when not present */ + if (cinfo->Se) + ac_in_use[compptr->ac_tbl_no] = 1; + } + + length = 0; + for (i = 0; i < NUM_ARITH_TBLS; i++) + length += dc_in_use[i] + ac_in_use[i]; + + if (length) { + emit_marker(cinfo, M_DAC); + + emit_2bytes(cinfo, length*2 + 2); + + for (i = 0; i < NUM_ARITH_TBLS; i++) { + if (dc_in_use[i]) { + emit_byte(cinfo, i); + emit_byte(cinfo, cinfo->arith_dc_L[i] + (cinfo->arith_dc_U[i]<<4)); + } + if (ac_in_use[i]) { + emit_byte(cinfo, i + 0x10); + emit_byte(cinfo, cinfo->arith_ac_K[i]); + } + } + } +#endif /* C_ARITH_CODING_SUPPORTED */ +} + + +LOCAL(void) +emit_dri (j_compress_ptr cinfo) +/* Emit a DRI marker */ +{ + emit_marker(cinfo, M_DRI); + + emit_2bytes(cinfo, 4); /* fixed length */ + + emit_2bytes(cinfo, (int) cinfo->restart_interval); +} + + +LOCAL(void) +emit_sof (j_compress_ptr cinfo, JPEG_MARKER code) +/* Emit a SOF marker */ +{ + int ci; + jpeg_component_info *compptr; + + emit_marker(cinfo, code); + + emit_2bytes(cinfo, 3 * cinfo->num_components + 2 + 5 + 1); /* length */ + + /* Make sure image isn't bigger than SOF field can handle */ + if ((long) cinfo->_jpeg_height > 65535L || + (long) cinfo->_jpeg_width > 65535L) + ERREXIT1(cinfo, JERR_IMAGE_TOO_BIG, (unsigned int) 65535); + + emit_byte(cinfo, cinfo->data_precision); + emit_2bytes(cinfo, (int) cinfo->_jpeg_height); + emit_2bytes(cinfo, (int) cinfo->_jpeg_width); + + emit_byte(cinfo, cinfo->num_components); + + for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components; + ci++, compptr++) { + emit_byte(cinfo, compptr->component_id); + emit_byte(cinfo, (compptr->h_samp_factor << 4) + compptr->v_samp_factor); + emit_byte(cinfo, compptr->quant_tbl_no); + } +} + + +LOCAL(void) +emit_sos (j_compress_ptr cinfo) +/* Emit a SOS marker */ +{ + int i, td, ta; + jpeg_component_info *compptr; + + emit_marker(cinfo, M_SOS); + + emit_2bytes(cinfo, 2 * cinfo->comps_in_scan + 2 + 1 + 3); /* length */ + + emit_byte(cinfo, cinfo->comps_in_scan); + + for (i = 0; i < cinfo->comps_in_scan; i++) { + compptr = cinfo->cur_comp_info[i]; + emit_byte(cinfo, compptr->component_id); + + /* We emit 0 for unused field(s); this is recommended by the P&M text + * but does not seem to be specified in the standard. + */ + + /* DC needs no table for refinement scan */ + td = cinfo->Ss == 0 && cinfo->Ah == 0 ? compptr->dc_tbl_no : 0; + /* AC needs no table when not present */ + ta = cinfo->Se ? compptr->ac_tbl_no : 0; + + emit_byte(cinfo, (td << 4) + ta); + } + + emit_byte(cinfo, cinfo->Ss); + emit_byte(cinfo, cinfo->Se); + emit_byte(cinfo, (cinfo->Ah << 4) + cinfo->Al); +} + + +LOCAL(void) +emit_jfif_app0 (j_compress_ptr cinfo) +/* Emit a JFIF-compliant APP0 marker */ +{ + /* + * Length of APP0 block (2 bytes) + * Block ID (4 bytes - ASCII "JFIF") + * Zero byte (1 byte to terminate the ID string) + * Version Major, Minor (2 bytes - major first) + * Units (1 byte - 0x00 = none, 0x01 = inch, 0x02 = cm) + * Xdpu (2 bytes - dots per unit horizontal) + * Ydpu (2 bytes - dots per unit vertical) + * Thumbnail X size (1 byte) + * Thumbnail Y size (1 byte) + */ + + emit_marker(cinfo, M_APP0); + + emit_2bytes(cinfo, 2 + 4 + 1 + 2 + 1 + 2 + 2 + 1 + 1); /* length */ + + emit_byte(cinfo, 0x4A); /* Identifier: ASCII "JFIF" */ + emit_byte(cinfo, 0x46); + emit_byte(cinfo, 0x49); + emit_byte(cinfo, 0x46); + emit_byte(cinfo, 0); + emit_byte(cinfo, cinfo->JFIF_major_version); /* Version fields */ + emit_byte(cinfo, cinfo->JFIF_minor_version); + emit_byte(cinfo, cinfo->density_unit); /* Pixel size information */ + emit_2bytes(cinfo, (int) cinfo->X_density); + emit_2bytes(cinfo, (int) cinfo->Y_density); + emit_byte(cinfo, 0); /* No thumbnail image */ + emit_byte(cinfo, 0); +} + + +LOCAL(void) +emit_adobe_app14 (j_compress_ptr cinfo) +/* Emit an Adobe APP14 marker */ +{ + /* + * Length of APP14 block (2 bytes) + * Block ID (5 bytes - ASCII "Adobe") + * Version Number (2 bytes - currently 100) + * Flags0 (2 bytes - currently 0) + * Flags1 (2 bytes - currently 0) + * Color transform (1 byte) + * + * Although Adobe TN 5116 mentions Version = 101, all the Adobe files + * now in circulation seem to use Version = 100, so that's what we write. + * + * We write the color transform byte as 1 if the JPEG color space is + * YCbCr, 2 if it's YCCK, 0 otherwise. Adobe's definition has to do with + * whether the encoder performed a transformation, which is pretty useless. + */ + + emit_marker(cinfo, M_APP14); + + emit_2bytes(cinfo, 2 + 5 + 2 + 2 + 2 + 1); /* length */ + + emit_byte(cinfo, 0x41); /* Identifier: ASCII "Adobe" */ + emit_byte(cinfo, 0x64); + emit_byte(cinfo, 0x6F); + emit_byte(cinfo, 0x62); + emit_byte(cinfo, 0x65); + emit_2bytes(cinfo, 100); /* Version */ + emit_2bytes(cinfo, 0); /* Flags0 */ + emit_2bytes(cinfo, 0); /* Flags1 */ + switch (cinfo->jpeg_color_space) { + case JCS_YCbCr: + emit_byte(cinfo, 1); /* Color transform = 1 */ + break; + case JCS_YCCK: + emit_byte(cinfo, 2); /* Color transform = 2 */ + break; + default: + emit_byte(cinfo, 0); /* Color transform = 0 */ + break; + } +} + + +/* + * These routines allow writing an arbitrary marker with parameters. + * The only intended use is to emit COM or APPn markers after calling + * write_file_header and before calling write_frame_header. + * Other uses are not guaranteed to produce desirable results. + * Counting the parameter bytes properly is the caller's responsibility. + */ + +METHODDEF(void) +write_marker_header (j_compress_ptr cinfo, int marker, unsigned int datalen) +/* Emit an arbitrary marker header */ +{ + if (datalen > (unsigned int) 65533) /* safety check */ + ERREXIT(cinfo, JERR_BAD_LENGTH); + + emit_marker(cinfo, (JPEG_MARKER) marker); + + emit_2bytes(cinfo, (int) (datalen + 2)); /* total length */ +} + +METHODDEF(void) +write_marker_byte (j_compress_ptr cinfo, int val) +/* Emit one byte of marker parameters following write_marker_header */ +{ + emit_byte(cinfo, val); +} + + +/* + * Write datastream header. + * This consists of an SOI and optional APPn markers. + * We recommend use of the JFIF marker, but not the Adobe marker, + * when using YCbCr or grayscale data. The JFIF marker should NOT + * be used for any other JPEG colorspace. The Adobe marker is helpful + * to distinguish RGB, CMYK, and YCCK colorspaces. + * Note that an application can write additional header markers after + * jpeg_start_compress returns. + */ + +METHODDEF(void) +write_file_header (j_compress_ptr cinfo) +{ + my_marker_ptr marker = (my_marker_ptr) cinfo->marker; + + emit_marker(cinfo, M_SOI); /* first the SOI */ + + /* SOI is defined to reset restart interval to 0 */ + marker->last_restart_interval = 0; + + if (cinfo->write_JFIF_header) /* next an optional JFIF APP0 */ + emit_jfif_app0(cinfo); + if (cinfo->write_Adobe_marker) /* next an optional Adobe APP14 */ + emit_adobe_app14(cinfo); +} + + +/* + * Write frame header. + * This consists of DQT and SOFn markers. + * Note that we do not emit the SOF until we have emitted the DQT(s). + * This avoids compatibility problems with incorrect implementations that + * try to error-check the quant table numbers as soon as they see the SOF. + */ + +METHODDEF(void) +write_frame_header (j_compress_ptr cinfo) +{ + int ci, prec; + boolean is_baseline; + jpeg_component_info *compptr; + + /* Emit DQT for each quantization table. + * Note that emit_dqt() suppresses any duplicate tables. + */ + prec = 0; + for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components; + ci++, compptr++) { + prec += emit_dqt(cinfo, compptr->quant_tbl_no); + } + /* now prec is nonzero iff there are any 16-bit quant tables. */ + + /* Check for a non-baseline specification. + * Note we assume that Huffman table numbers won't be changed later. + */ + if (cinfo->arith_code || cinfo->progressive_mode || + cinfo->data_precision != 8) { + is_baseline = FALSE; + } else { + is_baseline = TRUE; + for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components; + ci++, compptr++) { + if (compptr->dc_tbl_no > 1 || compptr->ac_tbl_no > 1) + is_baseline = FALSE; + } + if (prec && is_baseline) { + is_baseline = FALSE; + /* If it's baseline except for quantizer size, warn the user */ + TRACEMS(cinfo, 0, JTRC_16BIT_TABLES); + } + } + + /* Emit the proper SOF marker */ + if (cinfo->arith_code) { + if (cinfo->progressive_mode) + emit_sof(cinfo, M_SOF10); /* SOF code for progressive arithmetic */ + else + emit_sof(cinfo, M_SOF9); /* SOF code for sequential arithmetic */ + } else { + if (cinfo->progressive_mode) + emit_sof(cinfo, M_SOF2); /* SOF code for progressive Huffman */ + else if (is_baseline) + emit_sof(cinfo, M_SOF0); /* SOF code for baseline implementation */ + else + emit_sof(cinfo, M_SOF1); /* SOF code for non-baseline Huffman file */ + } +} + + +/* + * Write scan header. + * This consists of DHT or DAC markers, optional DRI, and SOS. + * Compressed data will be written following the SOS. + */ + +METHODDEF(void) +write_scan_header (j_compress_ptr cinfo) +{ + my_marker_ptr marker = (my_marker_ptr) cinfo->marker; + int i; + jpeg_component_info *compptr; + + if (cinfo->arith_code) { + /* Emit arith conditioning info. We may have some duplication + * if the file has multiple scans, but it's so small it's hardly + * worth worrying about. + */ + emit_dac(cinfo); + } else { + /* Emit Huffman tables. + * Note that emit_dht() suppresses any duplicate tables. + */ + for (i = 0; i < cinfo->comps_in_scan; i++) { + compptr = cinfo->cur_comp_info[i]; + /* DC needs no table for refinement scan */ + if (cinfo->Ss == 0 && cinfo->Ah == 0) + emit_dht(cinfo, compptr->dc_tbl_no, FALSE); + /* AC needs no table when not present */ + if (cinfo->Se) + emit_dht(cinfo, compptr->ac_tbl_no, TRUE); + } + } + + /* Emit DRI if required --- note that DRI value could change for each scan. + * We avoid wasting space with unnecessary DRIs, however. + */ + if (cinfo->restart_interval != marker->last_restart_interval) { + emit_dri(cinfo); + marker->last_restart_interval = cinfo->restart_interval; + } + + emit_sos(cinfo); +} + + +/* + * Write datastream trailer. + */ + +METHODDEF(void) +write_file_trailer (j_compress_ptr cinfo) +{ + emit_marker(cinfo, M_EOI); +} + + +/* + * Write an abbreviated table-specification datastream. + * This consists of SOI, DQT and DHT tables, and EOI. + * Any table that is defined and not marked sent_table = TRUE will be + * emitted. Note that all tables will be marked sent_table = TRUE at exit. + */ + +METHODDEF(void) +write_tables_only (j_compress_ptr cinfo) +{ + int i; + + emit_marker(cinfo, M_SOI); + + for (i = 0; i < NUM_QUANT_TBLS; i++) { + if (cinfo->quant_tbl_ptrs[i] != NULL) + (void) emit_dqt(cinfo, i); + } + + if (! cinfo->arith_code) { + for (i = 0; i < NUM_HUFF_TBLS; i++) { + if (cinfo->dc_huff_tbl_ptrs[i] != NULL) + emit_dht(cinfo, i, FALSE); + if (cinfo->ac_huff_tbl_ptrs[i] != NULL) + emit_dht(cinfo, i, TRUE); + } + } + + emit_marker(cinfo, M_EOI); +} + + +/* + * Initialize the marker writer module. + */ + +GLOBAL(void) +jinit_marker_writer (j_compress_ptr cinfo) +{ + my_marker_ptr marker; + + /* Create the subobject */ + marker = (my_marker_ptr) + (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE, + sizeof(my_marker_writer)); + cinfo->marker = (struct jpeg_marker_writer *) marker; + /* Initialize method pointers */ + marker->pub.write_file_header = write_file_header; + marker->pub.write_frame_header = write_frame_header; + marker->pub.write_scan_header = write_scan_header; + marker->pub.write_file_trailer = write_file_trailer; + marker->pub.write_tables_only = write_tables_only; + marker->pub.write_marker_header = write_marker_header; + marker->pub.write_marker_byte = write_marker_byte; + /* Initialize private state */ + marker->last_restart_interval = 0; +} diff --git a/media/libjpeg/jcmaster.c b/media/libjpeg/jcmaster.c new file mode 100644 index 000000000..03a8b40ea --- /dev/null +++ b/media/libjpeg/jcmaster.c @@ -0,0 +1,639 @@ +/* + * jcmaster.c + * + * This file was part of the Independent JPEG Group's software: + * Copyright (C) 1991-1997, Thomas G. Lane. + * Modified 2003-2010 by Guido Vollbeding. + * libjpeg-turbo Modifications: + * Copyright (C) 2010, 2016, D. R. Commander. + * For conditions of distribution and use, see the accompanying README.ijg + * file. + * + * This file contains master control logic for the JPEG compressor. + * These routines are concerned with parameter validation, initial setup, + * and inter-pass control (determining the number of passes and the work + * to be done in each pass). + */ + +#define JPEG_INTERNALS +#include "jinclude.h" +#include "jpeglib.h" +#include "jpegcomp.h" +#include "jconfigint.h" + + +/* Private state */ + +typedef enum { + main_pass, /* input data, also do first output step */ + huff_opt_pass, /* Huffman code optimization pass */ + output_pass /* data output pass */ +} c_pass_type; + +typedef struct { + struct jpeg_comp_master pub; /* public fields */ + + c_pass_type pass_type; /* the type of the current pass */ + + int pass_number; /* # of passes completed */ + int total_passes; /* total # of passes needed */ + + int scan_number; /* current index in scan_info[] */ + + /* + * This is here so we can add libjpeg-turbo version/build information to the + * global string table without introducing a new global symbol. Adding this + * information to the global string table allows one to examine a binary + * object and determine which version of libjpeg-turbo it was built from or + * linked against. + */ + const char *jpeg_version; + +} my_comp_master; + +typedef my_comp_master *my_master_ptr; + + +/* + * Support routines that do various essential calculations. + */ + +#if JPEG_LIB_VERSION >= 70 +/* + * Compute JPEG image dimensions and related values. + * NOTE: this is exported for possible use by application. + * Hence it mustn't do anything that can't be done twice. + */ + +GLOBAL(void) +jpeg_calc_jpeg_dimensions (j_compress_ptr cinfo) +/* Do computations that are needed before master selection phase */ +{ + /* Hardwire it to "no scaling" */ + cinfo->jpeg_width = cinfo->image_width; + cinfo->jpeg_height = cinfo->image_height; + cinfo->min_DCT_h_scaled_size = DCTSIZE; + cinfo->min_DCT_v_scaled_size = DCTSIZE; +} +#endif + + +LOCAL(void) +initial_setup (j_compress_ptr cinfo, boolean transcode_only) +/* Do computations that are needed before master selection phase */ +{ + int ci; + jpeg_component_info *compptr; + long samplesperrow; + JDIMENSION jd_samplesperrow; + +#if JPEG_LIB_VERSION >= 70 +#if JPEG_LIB_VERSION >= 80 + if (!transcode_only) +#endif + jpeg_calc_jpeg_dimensions(cinfo); +#endif + + /* Sanity check on image dimensions */ + if (cinfo->_jpeg_height <= 0 || cinfo->_jpeg_width <= 0 + || cinfo->num_components <= 0 || cinfo->input_components <= 0) + ERREXIT(cinfo, JERR_EMPTY_IMAGE); + + /* Make sure image isn't bigger than I can handle */ + if ((long) cinfo->_jpeg_height > (long) JPEG_MAX_DIMENSION || + (long) cinfo->_jpeg_width > (long) JPEG_MAX_DIMENSION) + ERREXIT1(cinfo, JERR_IMAGE_TOO_BIG, (unsigned int) JPEG_MAX_DIMENSION); + + /* Width of an input scanline must be representable as JDIMENSION. */ + samplesperrow = (long) cinfo->image_width * (long) cinfo->input_components; + jd_samplesperrow = (JDIMENSION) samplesperrow; + if ((long) jd_samplesperrow != samplesperrow) + ERREXIT(cinfo, JERR_WIDTH_OVERFLOW); + + /* For now, precision must match compiled-in value... */ + if (cinfo->data_precision != BITS_IN_JSAMPLE) + ERREXIT1(cinfo, JERR_BAD_PRECISION, cinfo->data_precision); + + /* Check that number of components won't exceed internal array sizes */ + if (cinfo->num_components > MAX_COMPONENTS) + ERREXIT2(cinfo, JERR_COMPONENT_COUNT, cinfo->num_components, + MAX_COMPONENTS); + + /* Compute maximum sampling factors; check factor validity */ + cinfo->max_h_samp_factor = 1; + cinfo->max_v_samp_factor = 1; + for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components; + ci++, compptr++) { + if (compptr->h_samp_factor<=0 || compptr->h_samp_factor>MAX_SAMP_FACTOR || + compptr->v_samp_factor<=0 || compptr->v_samp_factor>MAX_SAMP_FACTOR) + ERREXIT(cinfo, JERR_BAD_SAMPLING); + cinfo->max_h_samp_factor = MAX(cinfo->max_h_samp_factor, + compptr->h_samp_factor); + cinfo->max_v_samp_factor = MAX(cinfo->max_v_samp_factor, + compptr->v_samp_factor); + } + + /* Compute dimensions of components */ + for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components; + ci++, compptr++) { + /* Fill in the correct component_index value; don't rely on application */ + compptr->component_index = ci; + /* For compression, we never do DCT scaling. */ +#if JPEG_LIB_VERSION >= 70 + compptr->DCT_h_scaled_size = compptr->DCT_v_scaled_size = DCTSIZE; +#else + compptr->DCT_scaled_size = DCTSIZE; +#endif + /* Size in DCT blocks */ + compptr->width_in_blocks = (JDIMENSION) + jdiv_round_up((long) cinfo->_jpeg_width * (long) compptr->h_samp_factor, + (long) (cinfo->max_h_samp_factor * DCTSIZE)); + compptr->height_in_blocks = (JDIMENSION) + jdiv_round_up((long) cinfo->_jpeg_height * (long) compptr->v_samp_factor, + (long) (cinfo->max_v_samp_factor * DCTSIZE)); + /* Size in samples */ + compptr->downsampled_width = (JDIMENSION) + jdiv_round_up((long) cinfo->_jpeg_width * (long) compptr->h_samp_factor, + (long) cinfo->max_h_samp_factor); + compptr->downsampled_height = (JDIMENSION) + jdiv_round_up((long) cinfo->_jpeg_height * (long) compptr->v_samp_factor, + (long) cinfo->max_v_samp_factor); + /* Mark component needed (this flag isn't actually used for compression) */ + compptr->component_needed = TRUE; + } + + /* Compute number of fully interleaved MCU rows (number of times that + * main controller will call coefficient controller). + */ + cinfo->total_iMCU_rows = (JDIMENSION) + jdiv_round_up((long) cinfo->_jpeg_height, + (long) (cinfo->max_v_samp_factor*DCTSIZE)); +} + + +#ifdef C_MULTISCAN_FILES_SUPPORTED + +LOCAL(void) +validate_script (j_compress_ptr cinfo) +/* Verify that the scan script in cinfo->scan_info[] is valid; also + * determine whether it uses progressive JPEG, and set cinfo->progressive_mode. + */ +{ + const jpeg_scan_info *scanptr; + int scanno, ncomps, ci, coefi, thisi; + int Ss, Se, Ah, Al; + boolean component_sent[MAX_COMPONENTS]; +#ifdef C_PROGRESSIVE_SUPPORTED + int *last_bitpos_ptr; + int last_bitpos[MAX_COMPONENTS][DCTSIZE2]; + /* -1 until that coefficient has been seen; then last Al for it */ +#endif + + if (cinfo->num_scans <= 0) + ERREXIT1(cinfo, JERR_BAD_SCAN_SCRIPT, 0); + + /* For sequential JPEG, all scans must have Ss=0, Se=DCTSIZE2-1; + * for progressive JPEG, no scan can have this. + */ + scanptr = cinfo->scan_info; + if (scanptr->Ss != 0 || scanptr->Se != DCTSIZE2-1) { +#ifdef C_PROGRESSIVE_SUPPORTED + cinfo->progressive_mode = TRUE; + last_bitpos_ptr = & last_bitpos[0][0]; + for (ci = 0; ci < cinfo->num_components; ci++) + for (coefi = 0; coefi < DCTSIZE2; coefi++) + *last_bitpos_ptr++ = -1; +#else + ERREXIT(cinfo, JERR_NOT_COMPILED); +#endif + } else { + cinfo->progressive_mode = FALSE; + for (ci = 0; ci < cinfo->num_components; ci++) + component_sent[ci] = FALSE; + } + + for (scanno = 1; scanno <= cinfo->num_scans; scanptr++, scanno++) { + /* Validate component indexes */ + ncomps = scanptr->comps_in_scan; + if (ncomps <= 0 || ncomps > MAX_COMPS_IN_SCAN) + ERREXIT2(cinfo, JERR_COMPONENT_COUNT, ncomps, MAX_COMPS_IN_SCAN); + for (ci = 0; ci < ncomps; ci++) { + thisi = scanptr->component_index[ci]; + if (thisi < 0 || thisi >= cinfo->num_components) + ERREXIT1(cinfo, JERR_BAD_SCAN_SCRIPT, scanno); + /* Components must appear in SOF order within each scan */ + if (ci > 0 && thisi <= scanptr->component_index[ci-1]) + ERREXIT1(cinfo, JERR_BAD_SCAN_SCRIPT, scanno); + } + /* Validate progression parameters */ + Ss = scanptr->Ss; + Se = scanptr->Se; + Ah = scanptr->Ah; + Al = scanptr->Al; + if (cinfo->progressive_mode) { +#ifdef C_PROGRESSIVE_SUPPORTED + /* The JPEG spec simply gives the ranges 0..13 for Ah and Al, but that + * seems wrong: the upper bound ought to depend on data precision. + * Perhaps they really meant 0..N+1 for N-bit precision. + * Here we allow 0..10 for 8-bit data; Al larger than 10 results in + * out-of-range reconstructed DC values during the first DC scan, + * which might cause problems for some decoders. + */ +#if BITS_IN_JSAMPLE == 8 +#define MAX_AH_AL 10 +#else +#define MAX_AH_AL 13 +#endif + if (Ss < 0 || Ss >= DCTSIZE2 || Se < Ss || Se >= DCTSIZE2 || + Ah < 0 || Ah > MAX_AH_AL || Al < 0 || Al > MAX_AH_AL) + ERREXIT1(cinfo, JERR_BAD_PROG_SCRIPT, scanno); + if (Ss == 0) { + if (Se != 0) /* DC and AC together not OK */ + ERREXIT1(cinfo, JERR_BAD_PROG_SCRIPT, scanno); + } else { + if (ncomps != 1) /* AC scans must be for only one component */ + ERREXIT1(cinfo, JERR_BAD_PROG_SCRIPT, scanno); + } + for (ci = 0; ci < ncomps; ci++) { + last_bitpos_ptr = & last_bitpos[scanptr->component_index[ci]][0]; + if (Ss != 0 && last_bitpos_ptr[0] < 0) /* AC without prior DC scan */ + ERREXIT1(cinfo, JERR_BAD_PROG_SCRIPT, scanno); + for (coefi = Ss; coefi <= Se; coefi++) { + if (last_bitpos_ptr[coefi] < 0) { + /* first scan of this coefficient */ + if (Ah != 0) + ERREXIT1(cinfo, JERR_BAD_PROG_SCRIPT, scanno); + } else { + /* not first scan */ + if (Ah != last_bitpos_ptr[coefi] || Al != Ah-1) + ERREXIT1(cinfo, JERR_BAD_PROG_SCRIPT, scanno); + } + last_bitpos_ptr[coefi] = Al; + } + } +#endif + } else { + /* For sequential JPEG, all progression parameters must be these: */ + if (Ss != 0 || Se != DCTSIZE2-1 || Ah != 0 || Al != 0) + ERREXIT1(cinfo, JERR_BAD_PROG_SCRIPT, scanno); + /* Make sure components are not sent twice */ + for (ci = 0; ci < ncomps; ci++) { + thisi = scanptr->component_index[ci]; + if (component_sent[thisi]) + ERREXIT1(cinfo, JERR_BAD_SCAN_SCRIPT, scanno); + component_sent[thisi] = TRUE; + } + } + } + + /* Now verify that everything got sent. */ + if (cinfo->progressive_mode) { +#ifdef C_PROGRESSIVE_SUPPORTED + /* For progressive mode, we only check that at least some DC data + * got sent for each component; the spec does not require that all bits + * of all coefficients be transmitted. Would it be wiser to enforce + * transmission of all coefficient bits?? + */ + for (ci = 0; ci < cinfo->num_components; ci++) { + if (last_bitpos[ci][0] < 0) + ERREXIT(cinfo, JERR_MISSING_DATA); + } +#endif + } else { + for (ci = 0; ci < cinfo->num_components; ci++) { + if (! component_sent[ci]) + ERREXIT(cinfo, JERR_MISSING_DATA); + } + } +} + +#endif /* C_MULTISCAN_FILES_SUPPORTED */ + + +LOCAL(void) +select_scan_parameters (j_compress_ptr cinfo) +/* Set up the scan parameters for the current scan */ +{ + int ci; + +#ifdef C_MULTISCAN_FILES_SUPPORTED + if (cinfo->scan_info != NULL) { + /* Prepare for current scan --- the script is already validated */ + my_master_ptr master = (my_master_ptr) cinfo->master; + const jpeg_scan_info *scanptr = cinfo->scan_info + master->scan_number; + + cinfo->comps_in_scan = scanptr->comps_in_scan; + for (ci = 0; ci < scanptr->comps_in_scan; ci++) { + cinfo->cur_comp_info[ci] = + &cinfo->comp_info[scanptr->component_index[ci]]; + } + cinfo->Ss = scanptr->Ss; + cinfo->Se = scanptr->Se; + cinfo->Ah = scanptr->Ah; + cinfo->Al = scanptr->Al; + } + else +#endif + { + /* Prepare for single sequential-JPEG scan containing all components */ + if (cinfo->num_components > MAX_COMPS_IN_SCAN) + ERREXIT2(cinfo, JERR_COMPONENT_COUNT, cinfo->num_components, + MAX_COMPS_IN_SCAN); + cinfo->comps_in_scan = cinfo->num_components; + for (ci = 0; ci < cinfo->num_components; ci++) { + cinfo->cur_comp_info[ci] = &cinfo->comp_info[ci]; + } + cinfo->Ss = 0; + cinfo->Se = DCTSIZE2-1; + cinfo->Ah = 0; + cinfo->Al = 0; + } +} + + +LOCAL(void) +per_scan_setup (j_compress_ptr cinfo) +/* Do computations that are needed before processing a JPEG scan */ +/* cinfo->comps_in_scan and cinfo->cur_comp_info[] are already set */ +{ + int ci, mcublks, tmp; + jpeg_component_info *compptr; + + if (cinfo->comps_in_scan == 1) { + + /* Noninterleaved (single-component) scan */ + compptr = cinfo->cur_comp_info[0]; + + /* Overall image size in MCUs */ + cinfo->MCUs_per_row = compptr->width_in_blocks; + cinfo->MCU_rows_in_scan = compptr->height_in_blocks; + + /* For noninterleaved scan, always one block per MCU */ + compptr->MCU_width = 1; + compptr->MCU_height = 1; + compptr->MCU_blocks = 1; + compptr->MCU_sample_width = DCTSIZE; + compptr->last_col_width = 1; + /* For noninterleaved scans, it is convenient to define last_row_height + * as the number of block rows present in the last iMCU row. + */ + tmp = (int) (compptr->height_in_blocks % compptr->v_samp_factor); + if (tmp == 0) tmp = compptr->v_samp_factor; + compptr->last_row_height = tmp; + + /* Prepare array describing MCU composition */ + cinfo->blocks_in_MCU = 1; + cinfo->MCU_membership[0] = 0; + + } else { + + /* Interleaved (multi-component) scan */ + if (cinfo->comps_in_scan <= 0 || cinfo->comps_in_scan > MAX_COMPS_IN_SCAN) + ERREXIT2(cinfo, JERR_COMPONENT_COUNT, cinfo->comps_in_scan, + MAX_COMPS_IN_SCAN); + + /* Overall image size in MCUs */ + cinfo->MCUs_per_row = (JDIMENSION) + jdiv_round_up((long) cinfo->_jpeg_width, + (long) (cinfo->max_h_samp_factor*DCTSIZE)); + cinfo->MCU_rows_in_scan = (JDIMENSION) + jdiv_round_up((long) cinfo->_jpeg_height, + (long) (cinfo->max_v_samp_factor*DCTSIZE)); + + cinfo->blocks_in_MCU = 0; + + for (ci = 0; ci < cinfo->comps_in_scan; ci++) { + compptr = cinfo->cur_comp_info[ci]; + /* Sampling factors give # of blocks of component in each MCU */ + compptr->MCU_width = compptr->h_samp_factor; + compptr->MCU_height = compptr->v_samp_factor; + compptr->MCU_blocks = compptr->MCU_width * compptr->MCU_height; + compptr->MCU_sample_width = compptr->MCU_width * DCTSIZE; + /* Figure number of non-dummy blocks in last MCU column & row */ + tmp = (int) (compptr->width_in_blocks % compptr->MCU_width); + if (tmp == 0) tmp = compptr->MCU_width; + compptr->last_col_width = tmp; + tmp = (int) (compptr->height_in_blocks % compptr->MCU_height); + if (tmp == 0) tmp = compptr->MCU_height; + compptr->last_row_height = tmp; + /* Prepare array describing MCU composition */ + mcublks = compptr->MCU_blocks; + if (cinfo->blocks_in_MCU + mcublks > C_MAX_BLOCKS_IN_MCU) + ERREXIT(cinfo, JERR_BAD_MCU_SIZE); + while (mcublks-- > 0) { + cinfo->MCU_membership[cinfo->blocks_in_MCU++] = ci; + } + } + + } + + /* Convert restart specified in rows to actual MCU count. */ + /* Note that count must fit in 16 bits, so we provide limiting. */ + if (cinfo->restart_in_rows > 0) { + long nominal = (long) cinfo->restart_in_rows * (long) cinfo->MCUs_per_row; + cinfo->restart_interval = (unsigned int) MIN(nominal, 65535L); + } +} + + +/* + * Per-pass setup. + * This is called at the beginning of each pass. We determine which modules + * will be active during this pass and give them appropriate start_pass calls. + * We also set is_last_pass to indicate whether any more passes will be + * required. + */ + +METHODDEF(void) +prepare_for_pass (j_compress_ptr cinfo) +{ + my_master_ptr master = (my_master_ptr) cinfo->master; + + switch (master->pass_type) { + case main_pass: + /* Initial pass: will collect input data, and do either Huffman + * optimization or data output for the first scan. + */ + select_scan_parameters(cinfo); + per_scan_setup(cinfo); + if (! cinfo->raw_data_in) { + (*cinfo->cconvert->start_pass) (cinfo); + (*cinfo->downsample->start_pass) (cinfo); + (*cinfo->prep->start_pass) (cinfo, JBUF_PASS_THRU); + } + (*cinfo->fdct->start_pass) (cinfo); + (*cinfo->entropy->start_pass) (cinfo, cinfo->optimize_coding); + (*cinfo->coef->start_pass) (cinfo, + (master->total_passes > 1 ? + JBUF_SAVE_AND_PASS : JBUF_PASS_THRU)); + (*cinfo->main->start_pass) (cinfo, JBUF_PASS_THRU); + if (cinfo->optimize_coding) { + /* No immediate data output; postpone writing frame/scan headers */ + master->pub.call_pass_startup = FALSE; + } else { + /* Will write frame/scan headers at first jpeg_write_scanlines call */ + master->pub.call_pass_startup = TRUE; + } + break; +#ifdef ENTROPY_OPT_SUPPORTED + case huff_opt_pass: + /* Do Huffman optimization for a scan after the first one. */ + select_scan_parameters(cinfo); + per_scan_setup(cinfo); + if (cinfo->Ss != 0 || cinfo->Ah == 0 || cinfo->arith_code) { + (*cinfo->entropy->start_pass) (cinfo, TRUE); + (*cinfo->coef->start_pass) (cinfo, JBUF_CRANK_DEST); + master->pub.call_pass_startup = FALSE; + break; + } + /* Special case: Huffman DC refinement scans need no Huffman table + * and therefore we can skip the optimization pass for them. + */ + master->pass_type = output_pass; + master->pass_number++; + /*FALLTHROUGH*/ +#endif + case output_pass: + /* Do a data-output pass. */ + /* We need not repeat per-scan setup if prior optimization pass did it. */ + if (! cinfo->optimize_coding) { + select_scan_parameters(cinfo); + per_scan_setup(cinfo); + } + (*cinfo->entropy->start_pass) (cinfo, FALSE); + (*cinfo->coef->start_pass) (cinfo, JBUF_CRANK_DEST); + /* We emit frame/scan headers now */ + if (master->scan_number == 0) + (*cinfo->marker->write_frame_header) (cinfo); + (*cinfo->marker->write_scan_header) (cinfo); + master->pub.call_pass_startup = FALSE; + break; + default: + ERREXIT(cinfo, JERR_NOT_COMPILED); + } + + master->pub.is_last_pass = (master->pass_number == master->total_passes-1); + + /* Set up progress monitor's pass info if present */ + if (cinfo->progress != NULL) { + cinfo->progress->completed_passes = master->pass_number; + cinfo->progress->total_passes = master->total_passes; + } +} + + +/* + * Special start-of-pass hook. + * This is called by jpeg_write_scanlines if call_pass_startup is TRUE. + * In single-pass processing, we need this hook because we don't want to + * write frame/scan headers during jpeg_start_compress; we want to let the + * application write COM markers etc. between jpeg_start_compress and the + * jpeg_write_scanlines loop. + * In multi-pass processing, this routine is not used. + */ + +METHODDEF(void) +pass_startup (j_compress_ptr cinfo) +{ + cinfo->master->call_pass_startup = FALSE; /* reset flag so call only once */ + + (*cinfo->marker->write_frame_header) (cinfo); + (*cinfo->marker->write_scan_header) (cinfo); +} + + +/* + * Finish up at end of pass. + */ + +METHODDEF(void) +finish_pass_master (j_compress_ptr cinfo) +{ + my_master_ptr master = (my_master_ptr) cinfo->master; + + /* The entropy coder always needs an end-of-pass call, + * either to analyze statistics or to flush its output buffer. + */ + (*cinfo->entropy->finish_pass) (cinfo); + + /* Update state for next pass */ + switch (master->pass_type) { + case main_pass: + /* next pass is either output of scan 0 (after optimization) + * or output of scan 1 (if no optimization). + */ + master->pass_type = output_pass; + if (! cinfo->optimize_coding) + master->scan_number++; + break; + case huff_opt_pass: + /* next pass is always output of current scan */ + master->pass_type = output_pass; + break; + case output_pass: + /* next pass is either optimization or output of next scan */ + if (cinfo->optimize_coding) + master->pass_type = huff_opt_pass; + master->scan_number++; + break; + } + + master->pass_number++; +} + + +/* + * Initialize master compression control. + */ + +GLOBAL(void) +jinit_c_master_control (j_compress_ptr cinfo, boolean transcode_only) +{ + my_master_ptr master; + + master = (my_master_ptr) + (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE, + sizeof(my_comp_master)); + cinfo->master = (struct jpeg_comp_master *) master; + master->pub.prepare_for_pass = prepare_for_pass; + master->pub.pass_startup = pass_startup; + master->pub.finish_pass = finish_pass_master; + master->pub.is_last_pass = FALSE; + + /* Validate parameters, determine derived values */ + initial_setup(cinfo, transcode_only); + + if (cinfo->scan_info != NULL) { +#ifdef C_MULTISCAN_FILES_SUPPORTED + validate_script(cinfo); +#else + ERREXIT(cinfo, JERR_NOT_COMPILED); +#endif + } else { + cinfo->progressive_mode = FALSE; + cinfo->num_scans = 1; + } + + if (cinfo->progressive_mode && !cinfo->arith_code) /* TEMPORARY HACK ??? */ + cinfo->optimize_coding = TRUE; /* assume default tables no good for progressive mode */ + + /* Initialize my private state */ + if (transcode_only) { + /* no main pass in transcoding */ + if (cinfo->optimize_coding) + master->pass_type = huff_opt_pass; + else + master->pass_type = output_pass; + } else { + /* for normal compression, first pass is always this type: */ + master->pass_type = main_pass; + } + master->scan_number = 0; + master->pass_number = 0; + if (cinfo->optimize_coding) + master->total_passes = cinfo->num_scans * 2; + else + master->total_passes = cinfo->num_scans; + + master->jpeg_version = PACKAGE_NAME " version " VERSION " (build " BUILD ")"; +} diff --git a/media/libjpeg/jcomapi.c b/media/libjpeg/jcomapi.c new file mode 100644 index 000000000..6e5bf3dba --- /dev/null +++ b/media/libjpeg/jcomapi.c @@ -0,0 +1,109 @@ +/* + * jcomapi.c + * + * This file was part of the Independent JPEG Group's software: + * Copyright (C) 1994-1997, Thomas G. Lane. + * It was modified by The libjpeg-turbo Project to include only code relevant + * to libjpeg-turbo. + * For conditions of distribution and use, see the accompanying README.ijg + * file. + * + * This file contains application interface routines that are used for both + * compression and decompression. + */ + +#define JPEG_INTERNALS +#include "jinclude.h" +#include "jpeglib.h" + + +/* + * Abort processing of a JPEG compression or decompression operation, + * but don't destroy the object itself. + * + * For this, we merely clean up all the nonpermanent memory pools. + * Note that temp files (virtual arrays) are not allowed to belong to + * the permanent pool, so we will be able to close all temp files here. + * Closing a data source or destination, if necessary, is the application's + * responsibility. + */ + +GLOBAL(void) +jpeg_abort (j_common_ptr cinfo) +{ + int pool; + + /* Do nothing if called on a not-initialized or destroyed JPEG object. */ + if (cinfo->mem == NULL) + return; + + /* Releasing pools in reverse order might help avoid fragmentation + * with some (brain-damaged) malloc libraries. + */ + for (pool = JPOOL_NUMPOOLS-1; pool > JPOOL_PERMANENT; pool--) { + (*cinfo->mem->free_pool) (cinfo, pool); + } + + /* Reset overall state for possible reuse of object */ + if (cinfo->is_decompressor) { + cinfo->global_state = DSTATE_START; + /* Try to keep application from accessing now-deleted marker list. + * A bit kludgy to do it here, but this is the most central place. + */ + ((j_decompress_ptr) cinfo)->marker_list = NULL; + } else { + cinfo->global_state = CSTATE_START; + } +} + + +/* + * Destruction of a JPEG object. + * + * Everything gets deallocated except the master jpeg_compress_struct itself + * and the error manager struct. Both of these are supplied by the application + * and must be freed, if necessary, by the application. (Often they are on + * the stack and so don't need to be freed anyway.) + * Closing a data source or destination, if necessary, is the application's + * responsibility. + */ + +GLOBAL(void) +jpeg_destroy (j_common_ptr cinfo) +{ + /* We need only tell the memory manager to release everything. */ + /* NB: mem pointer is NULL if memory mgr failed to initialize. */ + if (cinfo->mem != NULL) + (*cinfo->mem->self_destruct) (cinfo); + cinfo->mem = NULL; /* be safe if jpeg_destroy is called twice */ + cinfo->global_state = 0; /* mark it destroyed */ +} + + +/* + * Convenience routines for allocating quantization and Huffman tables. + * (Would jutils.c be a more reasonable place to put these?) + */ + +GLOBAL(JQUANT_TBL *) +jpeg_alloc_quant_table (j_common_ptr cinfo) +{ + JQUANT_TBL *tbl; + + tbl = (JQUANT_TBL *) + (*cinfo->mem->alloc_small) (cinfo, JPOOL_PERMANENT, sizeof(JQUANT_TBL)); + tbl->sent_table = FALSE; /* make sure this is false in any new table */ + return tbl; +} + + +GLOBAL(JHUFF_TBL *) +jpeg_alloc_huff_table (j_common_ptr cinfo) +{ + JHUFF_TBL *tbl; + + tbl = (JHUFF_TBL *) + (*cinfo->mem->alloc_small) (cinfo, JPOOL_PERMANENT, sizeof(JHUFF_TBL)); + tbl->sent_table = FALSE; /* make sure this is false in any new table */ + return tbl; +} diff --git a/media/libjpeg/jconfig.h b/media/libjpeg/jconfig.h new file mode 100644 index 000000000..e882bbd30 --- /dev/null +++ b/media/libjpeg/jconfig.h @@ -0,0 +1,75 @@ +/* jconfig.h. Generated from jconfig.h.in by configure, then manually edited + for Mozilla. */ + +/* Export libjpeg v6.2's ABI. */ +#define JPEG_LIB_VERSION 62 + +/* libjpeg-turbo version */ +#define LIBJPEG_TURBO_VERSION 1.5.1 + +/* Support arithmetic encoding */ +/*#undef C_ARITH_CODING_SUPPORTED */ + +/* Support arithmetic decoding */ +/*#undef D_ARITH_CODING_SUPPORTED */ + +/* + * Define BITS_IN_JSAMPLE as either + * 8 for 8-bit sample values (the usual setting) + * 12 for 12-bit sample values + * Only 8 and 12 are legal data precisions for lossy JPEG according to the + * JPEG standard, and the IJG code does not support anything else! + * We do not support run-time selection of data precision, sorry. + */ +#define BITS_IN_JSAMPLE 8 /* use 8 or 12 */ + +/* Define to 1 if you have the <locale.h> header file. */ +/*#undef HAVE_LOCALE_H */ + +/* Define to 1 if you have the <stddef.h> header file. */ +#define HAVE_STDDEF_H 1 + +/* Define to 1 if you have the <stdlib.h> header file. */ +#define HAVE_STDLIB_H 1 + +/* Compiler supports 'unsigned char'. */ +#define HAVE_UNSIGNED_CHAR 1 + +/* Compiler supports 'unsigned short'. */ +#define HAVE_UNSIGNED_SHORT 1 + +/* Compiler does not support pointers to unspecified structures. */ +/* #define INCOMPLETE_TYPES_BROKEN 1 */ + +/* Support in-memory source/destination managers */ +/* #undef MEM_SRCDST_SUPPORTED */ + +/* Compiler has <strings.h> rather than standard <string.h>. */ +/* #undef NEED_BSD_STRINGS */ + +/* Need to include <sys/types.h> in order to obtain size_t. */ +#define NEED_SYS_TYPES_H 1 + +/* Broken compiler shifts signed values as an unsigned shift. */ +/* #undef RIGHT_SHIFT_IS_UNSIGNED */ + +/* Use accelerated SIMD routines. */ +#define WITH_SIMD 1 + +/* Define to 1 if type `char' is unsigned and you are not using gcc. */ +#ifndef __CHAR_UNSIGNED__ +/* # undef __CHAR_UNSIGNED__ */ +#endif + +/* Define to empty if `const' does not conform to ANSI C. */ +/* #undef const */ + +/* Define to `unsigned int' if <sys/types.h> does not define. */ +/* #undef size_t */ + +/* The size of `size_t', as computed by sizeof. */ +#ifdef HAVE_64BIT_BUILD +#define SIZEOF_SIZE_T 8 +#else +#define SIZEOF_SIZE_T 4 +#endif diff --git a/media/libjpeg/jconfigint.h b/media/libjpeg/jconfigint.h new file mode 100644 index 000000000..1289399f9 --- /dev/null +++ b/media/libjpeg/jconfigint.h @@ -0,0 +1,7 @@ +#define VERSION "1.5.1" +#define BUILD "2016-09-20" +#define PACKAGE_NAME "libjpeg-turbo" + +/* Need to use Mozilla-specific function inlining. */ +#include "mozilla/Attributes.h" +#define INLINE MOZ_ALWAYS_INLINE diff --git a/media/libjpeg/jcparam.c b/media/libjpeg/jcparam.c new file mode 100644 index 000000000..18b2d487a --- /dev/null +++ b/media/libjpeg/jcparam.c @@ -0,0 +1,542 @@ +/* + * jcparam.c + * + * This file was part of the Independent JPEG Group's software: + * Copyright (C) 1991-1998, Thomas G. Lane. + * Modified 2003-2008 by Guido Vollbeding. + * libjpeg-turbo Modifications: + * Copyright (C) 2009-2011, D. R. Commander. + * For conditions of distribution and use, see the accompanying README.ijg + * file. + * + * This file contains optional default-setting code for the JPEG compressor. + * Applications do not have to use this file, but those that don't use it + * must know a lot more about the innards of the JPEG code. + */ + +#define JPEG_INTERNALS +#include "jinclude.h" +#include "jpeglib.h" +#include "jstdhuff.c" + + +/* + * Quantization table setup routines + */ + +GLOBAL(void) +jpeg_add_quant_table (j_compress_ptr cinfo, int which_tbl, + const unsigned int *basic_table, + int scale_factor, boolean force_baseline) +/* Define a quantization table equal to the basic_table times + * a scale factor (given as a percentage). + * If force_baseline is TRUE, the computed quantization table entries + * are limited to 1..255 for JPEG baseline compatibility. + */ +{ + JQUANT_TBL **qtblptr; + int i; + long temp; + + /* Safety check to ensure start_compress not called yet. */ + if (cinfo->global_state != CSTATE_START) + ERREXIT1(cinfo, JERR_BAD_STATE, cinfo->global_state); + + if (which_tbl < 0 || which_tbl >= NUM_QUANT_TBLS) + ERREXIT1(cinfo, JERR_DQT_INDEX, which_tbl); + + qtblptr = & cinfo->quant_tbl_ptrs[which_tbl]; + + if (*qtblptr == NULL) + *qtblptr = jpeg_alloc_quant_table((j_common_ptr) cinfo); + + for (i = 0; i < DCTSIZE2; i++) { + temp = ((long) basic_table[i] * scale_factor + 50L) / 100L; + /* limit the values to the valid range */ + if (temp <= 0L) temp = 1L; + if (temp > 32767L) temp = 32767L; /* max quantizer needed for 12 bits */ + if (force_baseline && temp > 255L) + temp = 255L; /* limit to baseline range if requested */ + (*qtblptr)->quantval[i] = (UINT16) temp; + } + + /* Initialize sent_table FALSE so table will be written to JPEG file. */ + (*qtblptr)->sent_table = FALSE; +} + + +/* These are the sample quantization tables given in JPEG spec section K.1. + * The spec says that the values given produce "good" quality, and + * when divided by 2, "very good" quality. + */ +static const unsigned int std_luminance_quant_tbl[DCTSIZE2] = { + 16, 11, 10, 16, 24, 40, 51, 61, + 12, 12, 14, 19, 26, 58, 60, 55, + 14, 13, 16, 24, 40, 57, 69, 56, + 14, 17, 22, 29, 51, 87, 80, 62, + 18, 22, 37, 56, 68, 109, 103, 77, + 24, 35, 55, 64, 81, 104, 113, 92, + 49, 64, 78, 87, 103, 121, 120, 101, + 72, 92, 95, 98, 112, 100, 103, 99 +}; +static const unsigned int std_chrominance_quant_tbl[DCTSIZE2] = { + 17, 18, 24, 47, 99, 99, 99, 99, + 18, 21, 26, 66, 99, 99, 99, 99, + 24, 26, 56, 99, 99, 99, 99, 99, + 47, 66, 99, 99, 99, 99, 99, 99, + 99, 99, 99, 99, 99, 99, 99, 99, + 99, 99, 99, 99, 99, 99, 99, 99, + 99, 99, 99, 99, 99, 99, 99, 99, + 99, 99, 99, 99, 99, 99, 99, 99 +}; + + +#if JPEG_LIB_VERSION >= 70 +GLOBAL(void) +jpeg_default_qtables (j_compress_ptr cinfo, boolean force_baseline) +/* Set or change the 'quality' (quantization) setting, using default tables + * and straight percentage-scaling quality scales. + * This entry point allows different scalings for luminance and chrominance. + */ +{ + /* Set up two quantization tables using the specified scaling */ + jpeg_add_quant_table(cinfo, 0, std_luminance_quant_tbl, + cinfo->q_scale_factor[0], force_baseline); + jpeg_add_quant_table(cinfo, 1, std_chrominance_quant_tbl, + cinfo->q_scale_factor[1], force_baseline); +} +#endif + + +GLOBAL(void) +jpeg_set_linear_quality (j_compress_ptr cinfo, int scale_factor, + boolean force_baseline) +/* Set or change the 'quality' (quantization) setting, using default tables + * and a straight percentage-scaling quality scale. In most cases it's better + * to use jpeg_set_quality (below); this entry point is provided for + * applications that insist on a linear percentage scaling. + */ +{ + /* Set up two quantization tables using the specified scaling */ + jpeg_add_quant_table(cinfo, 0, std_luminance_quant_tbl, + scale_factor, force_baseline); + jpeg_add_quant_table(cinfo, 1, std_chrominance_quant_tbl, + scale_factor, force_baseline); +} + + +GLOBAL(int) +jpeg_quality_scaling (int quality) +/* Convert a user-specified quality rating to a percentage scaling factor + * for an underlying quantization table, using our recommended scaling curve. + * The input 'quality' factor should be 0 (terrible) to 100 (very good). + */ +{ + /* Safety limit on quality factor. Convert 0 to 1 to avoid zero divide. */ + if (quality <= 0) quality = 1; + if (quality > 100) quality = 100; + + /* The basic table is used as-is (scaling 100) for a quality of 50. + * Qualities 50..100 are converted to scaling percentage 200 - 2*Q; + * note that at Q=100 the scaling is 0, which will cause jpeg_add_quant_table + * to make all the table entries 1 (hence, minimum quantization loss). + * Qualities 1..50 are converted to scaling percentage 5000/Q. + */ + if (quality < 50) + quality = 5000 / quality; + else + quality = 200 - quality*2; + + return quality; +} + + +GLOBAL(void) +jpeg_set_quality (j_compress_ptr cinfo, int quality, boolean force_baseline) +/* Set or change the 'quality' (quantization) setting, using default tables. + * This is the standard quality-adjusting entry point for typical user + * interfaces; only those who want detailed control over quantization tables + * would use the preceding three routines directly. + */ +{ + /* Convert user 0-100 rating to percentage scaling */ + quality = jpeg_quality_scaling(quality); + + /* Set up standard quality tables */ + jpeg_set_linear_quality(cinfo, quality, force_baseline); +} + + +/* + * Default parameter setup for compression. + * + * Applications that don't choose to use this routine must do their + * own setup of all these parameters. Alternately, you can call this + * to establish defaults and then alter parameters selectively. This + * is the recommended approach since, if we add any new parameters, + * your code will still work (they'll be set to reasonable defaults). + */ + +GLOBAL(void) +jpeg_set_defaults (j_compress_ptr cinfo) +{ + int i; + + /* Safety check to ensure start_compress not called yet. */ + if (cinfo->global_state != CSTATE_START) + ERREXIT1(cinfo, JERR_BAD_STATE, cinfo->global_state); + + /* Allocate comp_info array large enough for maximum component count. + * Array is made permanent in case application wants to compress + * multiple images at same param settings. + */ + if (cinfo->comp_info == NULL) + cinfo->comp_info = (jpeg_component_info *) + (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_PERMANENT, + MAX_COMPONENTS * sizeof(jpeg_component_info)); + + /* Initialize everything not dependent on the color space */ + +#if JPEG_LIB_VERSION >= 70 + cinfo->scale_num = 1; /* 1:1 scaling */ + cinfo->scale_denom = 1; +#endif + cinfo->data_precision = BITS_IN_JSAMPLE; + /* Set up two quantization tables using default quality of 75 */ + jpeg_set_quality(cinfo, 75, TRUE); + /* Set up two Huffman tables */ + std_huff_tables((j_common_ptr) cinfo); + + /* Initialize default arithmetic coding conditioning */ + for (i = 0; i < NUM_ARITH_TBLS; i++) { + cinfo->arith_dc_L[i] = 0; + cinfo->arith_dc_U[i] = 1; + cinfo->arith_ac_K[i] = 5; + } + + /* Default is no multiple-scan output */ + cinfo->scan_info = NULL; + cinfo->num_scans = 0; + + /* Expect normal source image, not raw downsampled data */ + cinfo->raw_data_in = FALSE; + + /* Use Huffman coding, not arithmetic coding, by default */ + cinfo->arith_code = FALSE; + + /* By default, don't do extra passes to optimize entropy coding */ + cinfo->optimize_coding = FALSE; + /* The standard Huffman tables are only valid for 8-bit data precision. + * If the precision is higher, force optimization on so that usable + * tables will be computed. This test can be removed if default tables + * are supplied that are valid for the desired precision. + */ + if (cinfo->data_precision > 8) + cinfo->optimize_coding = TRUE; + + /* By default, use the simpler non-cosited sampling alignment */ + cinfo->CCIR601_sampling = FALSE; + +#if JPEG_LIB_VERSION >= 70 + /* By default, apply fancy downsampling */ + cinfo->do_fancy_downsampling = TRUE; +#endif + + /* No input smoothing */ + cinfo->smoothing_factor = 0; + + /* DCT algorithm preference */ + cinfo->dct_method = JDCT_DEFAULT; + + /* No restart markers */ + cinfo->restart_interval = 0; + cinfo->restart_in_rows = 0; + + /* Fill in default JFIF marker parameters. Note that whether the marker + * will actually be written is determined by jpeg_set_colorspace. + * + * By default, the library emits JFIF version code 1.01. + * An application that wants to emit JFIF 1.02 extension markers should set + * JFIF_minor_version to 2. We could probably get away with just defaulting + * to 1.02, but there may still be some decoders in use that will complain + * about that; saying 1.01 should minimize compatibility problems. + */ + cinfo->JFIF_major_version = 1; /* Default JFIF version = 1.01 */ + cinfo->JFIF_minor_version = 1; + cinfo->density_unit = 0; /* Pixel size is unknown by default */ + cinfo->X_density = 1; /* Pixel aspect ratio is square by default */ + cinfo->Y_density = 1; + + /* Choose JPEG colorspace based on input space, set defaults accordingly */ + + jpeg_default_colorspace(cinfo); +} + + +/* + * Select an appropriate JPEG colorspace for in_color_space. + */ + +GLOBAL(void) +jpeg_default_colorspace (j_compress_ptr cinfo) +{ + switch (cinfo->in_color_space) { + case JCS_GRAYSCALE: + jpeg_set_colorspace(cinfo, JCS_GRAYSCALE); + break; + case JCS_RGB: + case JCS_EXT_RGB: + case JCS_EXT_RGBX: + case JCS_EXT_BGR: + case JCS_EXT_BGRX: + case JCS_EXT_XBGR: + case JCS_EXT_XRGB: + case JCS_EXT_RGBA: + case JCS_EXT_BGRA: + case JCS_EXT_ABGR: + case JCS_EXT_ARGB: + jpeg_set_colorspace(cinfo, JCS_YCbCr); + break; + case JCS_YCbCr: + jpeg_set_colorspace(cinfo, JCS_YCbCr); + break; + case JCS_CMYK: + jpeg_set_colorspace(cinfo, JCS_CMYK); /* By default, no translation */ + break; + case JCS_YCCK: + jpeg_set_colorspace(cinfo, JCS_YCCK); + break; + case JCS_UNKNOWN: + jpeg_set_colorspace(cinfo, JCS_UNKNOWN); + break; + default: + ERREXIT(cinfo, JERR_BAD_IN_COLORSPACE); + } +} + + +/* + * Set the JPEG colorspace, and choose colorspace-dependent default values. + */ + +GLOBAL(void) +jpeg_set_colorspace (j_compress_ptr cinfo, J_COLOR_SPACE colorspace) +{ + jpeg_component_info *compptr; + int ci; + +#define SET_COMP(index,id,hsamp,vsamp,quant,dctbl,actbl) \ + (compptr = &cinfo->comp_info[index], \ + compptr->component_id = (id), \ + compptr->h_samp_factor = (hsamp), \ + compptr->v_samp_factor = (vsamp), \ + compptr->quant_tbl_no = (quant), \ + compptr->dc_tbl_no = (dctbl), \ + compptr->ac_tbl_no = (actbl) ) + + /* Safety check to ensure start_compress not called yet. */ + if (cinfo->global_state != CSTATE_START) + ERREXIT1(cinfo, JERR_BAD_STATE, cinfo->global_state); + + /* For all colorspaces, we use Q and Huff tables 0 for luminance components, + * tables 1 for chrominance components. + */ + + cinfo->jpeg_color_space = colorspace; + + cinfo->write_JFIF_header = FALSE; /* No marker for non-JFIF colorspaces */ + cinfo->write_Adobe_marker = FALSE; /* write no Adobe marker by default */ + + switch (colorspace) { + case JCS_GRAYSCALE: + cinfo->write_JFIF_header = TRUE; /* Write a JFIF marker */ + cinfo->num_components = 1; + /* JFIF specifies component ID 1 */ + SET_COMP(0, 1, 1,1, 0, 0,0); + break; + case JCS_RGB: + cinfo->write_Adobe_marker = TRUE; /* write Adobe marker to flag RGB */ + cinfo->num_components = 3; + SET_COMP(0, 0x52 /* 'R' */, 1,1, 0, 0,0); + SET_COMP(1, 0x47 /* 'G' */, 1,1, 0, 0,0); + SET_COMP(2, 0x42 /* 'B' */, 1,1, 0, 0,0); + break; + case JCS_YCbCr: + cinfo->write_JFIF_header = TRUE; /* Write a JFIF marker */ + cinfo->num_components = 3; + /* JFIF specifies component IDs 1,2,3 */ + /* We default to 2x2 subsamples of chrominance */ + SET_COMP(0, 1, 2,2, 0, 0,0); + SET_COMP(1, 2, 1,1, 1, 1,1); + SET_COMP(2, 3, 1,1, 1, 1,1); + break; + case JCS_CMYK: + cinfo->write_Adobe_marker = TRUE; /* write Adobe marker to flag CMYK */ + cinfo->num_components = 4; + SET_COMP(0, 0x43 /* 'C' */, 1,1, 0, 0,0); + SET_COMP(1, 0x4D /* 'M' */, 1,1, 0, 0,0); + SET_COMP(2, 0x59 /* 'Y' */, 1,1, 0, 0,0); + SET_COMP(3, 0x4B /* 'K' */, 1,1, 0, 0,0); + break; + case JCS_YCCK: + cinfo->write_Adobe_marker = TRUE; /* write Adobe marker to flag YCCK */ + cinfo->num_components = 4; + SET_COMP(0, 1, 2,2, 0, 0,0); + SET_COMP(1, 2, 1,1, 1, 1,1); + SET_COMP(2, 3, 1,1, 1, 1,1); + SET_COMP(3, 4, 2,2, 0, 0,0); + break; + case JCS_UNKNOWN: + cinfo->num_components = cinfo->input_components; + if (cinfo->num_components < 1 || cinfo->num_components > MAX_COMPONENTS) + ERREXIT2(cinfo, JERR_COMPONENT_COUNT, cinfo->num_components, + MAX_COMPONENTS); + for (ci = 0; ci < cinfo->num_components; ci++) { + SET_COMP(ci, ci, 1,1, 0, 0,0); + } + break; + default: + ERREXIT(cinfo, JERR_BAD_J_COLORSPACE); + } +} + + +#ifdef C_PROGRESSIVE_SUPPORTED + +LOCAL(jpeg_scan_info *) +fill_a_scan (jpeg_scan_info *scanptr, int ci, + int Ss, int Se, int Ah, int Al) +/* Support routine: generate one scan for specified component */ +{ + scanptr->comps_in_scan = 1; + scanptr->component_index[0] = ci; + scanptr->Ss = Ss; + scanptr->Se = Se; + scanptr->Ah = Ah; + scanptr->Al = Al; + scanptr++; + return scanptr; +} + +LOCAL(jpeg_scan_info *) +fill_scans (jpeg_scan_info *scanptr, int ncomps, + int Ss, int Se, int Ah, int Al) +/* Support routine: generate one scan for each component */ +{ + int ci; + + for (ci = 0; ci < ncomps; ci++) { + scanptr->comps_in_scan = 1; + scanptr->component_index[0] = ci; + scanptr->Ss = Ss; + scanptr->Se = Se; + scanptr->Ah = Ah; + scanptr->Al = Al; + scanptr++; + } + return scanptr; +} + +LOCAL(jpeg_scan_info *) +fill_dc_scans (jpeg_scan_info *scanptr, int ncomps, int Ah, int Al) +/* Support routine: generate interleaved DC scan if possible, else N scans */ +{ + int ci; + + if (ncomps <= MAX_COMPS_IN_SCAN) { + /* Single interleaved DC scan */ + scanptr->comps_in_scan = ncomps; + for (ci = 0; ci < ncomps; ci++) + scanptr->component_index[ci] = ci; + scanptr->Ss = scanptr->Se = 0; + scanptr->Ah = Ah; + scanptr->Al = Al; + scanptr++; + } else { + /* Noninterleaved DC scan for each component */ + scanptr = fill_scans(scanptr, ncomps, 0, 0, Ah, Al); + } + return scanptr; +} + + +/* + * Create a recommended progressive-JPEG script. + * cinfo->num_components and cinfo->jpeg_color_space must be correct. + */ + +GLOBAL(void) +jpeg_simple_progression (j_compress_ptr cinfo) +{ + int ncomps = cinfo->num_components; + int nscans; + jpeg_scan_info *scanptr; + + /* Safety check to ensure start_compress not called yet. */ + if (cinfo->global_state != CSTATE_START) + ERREXIT1(cinfo, JERR_BAD_STATE, cinfo->global_state); + + /* Figure space needed for script. Calculation must match code below! */ + if (ncomps == 3 && cinfo->jpeg_color_space == JCS_YCbCr) { + /* Custom script for YCbCr color images. */ + nscans = 10; + } else { + /* All-purpose script for other color spaces. */ + if (ncomps > MAX_COMPS_IN_SCAN) + nscans = 6 * ncomps; /* 2 DC + 4 AC scans per component */ + else + nscans = 2 + 4 * ncomps; /* 2 DC scans; 4 AC scans per component */ + } + + /* Allocate space for script. + * We need to put it in the permanent pool in case the application performs + * multiple compressions without changing the settings. To avoid a memory + * leak if jpeg_simple_progression is called repeatedly for the same JPEG + * object, we try to re-use previously allocated space, and we allocate + * enough space to handle YCbCr even if initially asked for grayscale. + */ + if (cinfo->script_space == NULL || cinfo->script_space_size < nscans) { + cinfo->script_space_size = MAX(nscans, 10); + cinfo->script_space = (jpeg_scan_info *) + (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_PERMANENT, + cinfo->script_space_size * sizeof(jpeg_scan_info)); + } + scanptr = cinfo->script_space; + cinfo->scan_info = scanptr; + cinfo->num_scans = nscans; + + if (ncomps == 3 && cinfo->jpeg_color_space == JCS_YCbCr) { + /* Custom script for YCbCr color images. */ + /* Initial DC scan */ + scanptr = fill_dc_scans(scanptr, ncomps, 0, 1); + /* Initial AC scan: get some luma data out in a hurry */ + scanptr = fill_a_scan(scanptr, 0, 1, 5, 0, 2); + /* Chroma data is too small to be worth expending many scans on */ + scanptr = fill_a_scan(scanptr, 2, 1, 63, 0, 1); + scanptr = fill_a_scan(scanptr, 1, 1, 63, 0, 1); + /* Complete spectral selection for luma AC */ + scanptr = fill_a_scan(scanptr, 0, 6, 63, 0, 2); + /* Refine next bit of luma AC */ + scanptr = fill_a_scan(scanptr, 0, 1, 63, 2, 1); + /* Finish DC successive approximation */ + scanptr = fill_dc_scans(scanptr, ncomps, 1, 0); + /* Finish AC successive approximation */ + scanptr = fill_a_scan(scanptr, 2, 1, 63, 1, 0); + scanptr = fill_a_scan(scanptr, 1, 1, 63, 1, 0); + /* Luma bottom bit comes last since it's usually largest scan */ + scanptr = fill_a_scan(scanptr, 0, 1, 63, 1, 0); + } else { + /* All-purpose script for other color spaces. */ + /* Successive approximation first pass */ + scanptr = fill_dc_scans(scanptr, ncomps, 0, 1); + scanptr = fill_scans(scanptr, ncomps, 1, 5, 0, 2); + scanptr = fill_scans(scanptr, ncomps, 6, 63, 0, 2); + /* Successive approximation second pass */ + scanptr = fill_scans(scanptr, ncomps, 1, 63, 2, 1); + /* Successive approximation final pass */ + scanptr = fill_dc_scans(scanptr, ncomps, 1, 0); + scanptr = fill_scans(scanptr, ncomps, 1, 63, 1, 0); + } +} + +#endif /* C_PROGRESSIVE_SUPPORTED */ diff --git a/media/libjpeg/jcphuff.c b/media/libjpeg/jcphuff.c new file mode 100644 index 000000000..046e2e18d --- /dev/null +++ b/media/libjpeg/jcphuff.c @@ -0,0 +1,834 @@ +/* + * jcphuff.c + * + * This file was part of the Independent JPEG Group's software: + * Copyright (C) 1995-1997, Thomas G. Lane. + * libjpeg-turbo Modifications: + * Copyright (C) 2015, D. R. Commander. + * For conditions of distribution and use, see the accompanying README.ijg + * file. + * + * This file contains Huffman entropy encoding routines for progressive JPEG. + * + * We do not support output suspension in this module, since the library + * currently does not allow multiple-scan files to be written with output + * suspension. + */ + +#define JPEG_INTERNALS +#include "jinclude.h" +#include "jpeglib.h" +#include "jchuff.h" /* Declarations shared with jchuff.c */ + +#ifdef C_PROGRESSIVE_SUPPORTED + +/* Expanded entropy encoder object for progressive Huffman encoding. */ + +typedef struct { + struct jpeg_entropy_encoder pub; /* public fields */ + + /* Mode flag: TRUE for optimization, FALSE for actual data output */ + boolean gather_statistics; + + /* Bit-level coding status. + * next_output_byte/free_in_buffer are local copies of cinfo->dest fields. + */ + JOCTET *next_output_byte; /* => next byte to write in buffer */ + size_t free_in_buffer; /* # of byte spaces remaining in buffer */ + size_t put_buffer; /* current bit-accumulation buffer */ + int put_bits; /* # of bits now in it */ + j_compress_ptr cinfo; /* link to cinfo (needed for dump_buffer) */ + + /* Coding status for DC components */ + int last_dc_val[MAX_COMPS_IN_SCAN]; /* last DC coef for each component */ + + /* Coding status for AC components */ + int ac_tbl_no; /* the table number of the single component */ + unsigned int EOBRUN; /* run length of EOBs */ + unsigned int BE; /* # of buffered correction bits before MCU */ + char *bit_buffer; /* buffer for correction bits (1 per char) */ + /* packing correction bits tightly would save some space but cost time... */ + + unsigned int restarts_to_go; /* MCUs left in this restart interval */ + int next_restart_num; /* next restart number to write (0-7) */ + + /* Pointers to derived tables (these workspaces have image lifespan). + * Since any one scan codes only DC or only AC, we only need one set + * of tables, not one for DC and one for AC. + */ + c_derived_tbl *derived_tbls[NUM_HUFF_TBLS]; + + /* Statistics tables for optimization; again, one set is enough */ + long *count_ptrs[NUM_HUFF_TBLS]; +} phuff_entropy_encoder; + +typedef phuff_entropy_encoder *phuff_entropy_ptr; + +/* MAX_CORR_BITS is the number of bits the AC refinement correction-bit + * buffer can hold. Larger sizes may slightly improve compression, but + * 1000 is already well into the realm of overkill. + * The minimum safe size is 64 bits. + */ + +#define MAX_CORR_BITS 1000 /* Max # of correction bits I can buffer */ + +/* IRIGHT_SHIFT is like RIGHT_SHIFT, but works on int rather than JLONG. + * We assume that int right shift is unsigned if JLONG right shift is, + * which should be safe. + */ + +#ifdef RIGHT_SHIFT_IS_UNSIGNED +#define ISHIFT_TEMPS int ishift_temp; +#define IRIGHT_SHIFT(x,shft) \ + ((ishift_temp = (x)) < 0 ? \ + (ishift_temp >> (shft)) | ((~0) << (16-(shft))) : \ + (ishift_temp >> (shft))) +#else +#define ISHIFT_TEMPS +#define IRIGHT_SHIFT(x,shft) ((x) >> (shft)) +#endif + +/* Forward declarations */ +METHODDEF(boolean) encode_mcu_DC_first (j_compress_ptr cinfo, + JBLOCKROW *MCU_data); +METHODDEF(boolean) encode_mcu_AC_first (j_compress_ptr cinfo, + JBLOCKROW *MCU_data); +METHODDEF(boolean) encode_mcu_DC_refine (j_compress_ptr cinfo, + JBLOCKROW *MCU_data); +METHODDEF(boolean) encode_mcu_AC_refine (j_compress_ptr cinfo, + JBLOCKROW *MCU_data); +METHODDEF(void) finish_pass_phuff (j_compress_ptr cinfo); +METHODDEF(void) finish_pass_gather_phuff (j_compress_ptr cinfo); + + +/* + * Initialize for a Huffman-compressed scan using progressive JPEG. + */ + +METHODDEF(void) +start_pass_phuff (j_compress_ptr cinfo, boolean gather_statistics) +{ + phuff_entropy_ptr entropy = (phuff_entropy_ptr) cinfo->entropy; + boolean is_DC_band; + int ci, tbl; + jpeg_component_info *compptr; + + entropy->cinfo = cinfo; + entropy->gather_statistics = gather_statistics; + + is_DC_band = (cinfo->Ss == 0); + + /* We assume jcmaster.c already validated the scan parameters. */ + + /* Select execution routines */ + if (cinfo->Ah == 0) { + if (is_DC_band) + entropy->pub.encode_mcu = encode_mcu_DC_first; + else + entropy->pub.encode_mcu = encode_mcu_AC_first; + } else { + if (is_DC_band) + entropy->pub.encode_mcu = encode_mcu_DC_refine; + else { + entropy->pub.encode_mcu = encode_mcu_AC_refine; + /* AC refinement needs a correction bit buffer */ + if (entropy->bit_buffer == NULL) + entropy->bit_buffer = (char *) + (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE, + MAX_CORR_BITS * sizeof(char)); + } + } + if (gather_statistics) + entropy->pub.finish_pass = finish_pass_gather_phuff; + else + entropy->pub.finish_pass = finish_pass_phuff; + + /* Only DC coefficients may be interleaved, so cinfo->comps_in_scan = 1 + * for AC coefficients. + */ + for (ci = 0; ci < cinfo->comps_in_scan; ci++) { + compptr = cinfo->cur_comp_info[ci]; + /* Initialize DC predictions to 0 */ + entropy->last_dc_val[ci] = 0; + /* Get table index */ + if (is_DC_band) { + if (cinfo->Ah != 0) /* DC refinement needs no table */ + continue; + tbl = compptr->dc_tbl_no; + } else { + entropy->ac_tbl_no = tbl = compptr->ac_tbl_no; + } + if (gather_statistics) { + /* Check for invalid table index */ + /* (make_c_derived_tbl does this in the other path) */ + if (tbl < 0 || tbl >= NUM_HUFF_TBLS) + ERREXIT1(cinfo, JERR_NO_HUFF_TABLE, tbl); + /* Allocate and zero the statistics tables */ + /* Note that jpeg_gen_optimal_table expects 257 entries in each table! */ + if (entropy->count_ptrs[tbl] == NULL) + entropy->count_ptrs[tbl] = (long *) + (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE, + 257 * sizeof(long)); + MEMZERO(entropy->count_ptrs[tbl], 257 * sizeof(long)); + } else { + /* Compute derived values for Huffman table */ + /* We may do this more than once for a table, but it's not expensive */ + jpeg_make_c_derived_tbl(cinfo, is_DC_band, tbl, + & entropy->derived_tbls[tbl]); + } + } + + /* Initialize AC stuff */ + entropy->EOBRUN = 0; + entropy->BE = 0; + + /* Initialize bit buffer to empty */ + entropy->put_buffer = 0; + entropy->put_bits = 0; + + /* Initialize restart stuff */ + entropy->restarts_to_go = cinfo->restart_interval; + entropy->next_restart_num = 0; +} + + +/* Outputting bytes to the file. + * NB: these must be called only when actually outputting, + * that is, entropy->gather_statistics == FALSE. + */ + +/* Emit a byte */ +#define emit_byte(entropy,val) \ + { *(entropy)->next_output_byte++ = (JOCTET) (val); \ + if (--(entropy)->free_in_buffer == 0) \ + dump_buffer(entropy); } + + +LOCAL(void) +dump_buffer (phuff_entropy_ptr entropy) +/* Empty the output buffer; we do not support suspension in this module. */ +{ + struct jpeg_destination_mgr *dest = entropy->cinfo->dest; + + if (! (*dest->empty_output_buffer) (entropy->cinfo)) + ERREXIT(entropy->cinfo, JERR_CANT_SUSPEND); + /* After a successful buffer dump, must reset buffer pointers */ + entropy->next_output_byte = dest->next_output_byte; + entropy->free_in_buffer = dest->free_in_buffer; +} + + +/* Outputting bits to the file */ + +/* Only the right 24 bits of put_buffer are used; the valid bits are + * left-justified in this part. At most 16 bits can be passed to emit_bits + * in one call, and we never retain more than 7 bits in put_buffer + * between calls, so 24 bits are sufficient. + */ + +LOCAL(void) +emit_bits (phuff_entropy_ptr entropy, unsigned int code, int size) +/* Emit some bits, unless we are in gather mode */ +{ + /* This routine is heavily used, so it's worth coding tightly. */ + register size_t put_buffer = (size_t) code; + register int put_bits = entropy->put_bits; + + /* if size is 0, caller used an invalid Huffman table entry */ + if (size == 0) + ERREXIT(entropy->cinfo, JERR_HUFF_MISSING_CODE); + + if (entropy->gather_statistics) + return; /* do nothing if we're only getting stats */ + + put_buffer &= (((size_t) 1)<<size) - 1; /* mask off any extra bits in code */ + + put_bits += size; /* new number of bits in buffer */ + + put_buffer <<= 24 - put_bits; /* align incoming bits */ + + put_buffer |= entropy->put_buffer; /* and merge with old buffer contents */ + + while (put_bits >= 8) { + int c = (int) ((put_buffer >> 16) & 0xFF); + + emit_byte(entropy, c); + if (c == 0xFF) { /* need to stuff a zero byte? */ + emit_byte(entropy, 0); + } + put_buffer <<= 8; + put_bits -= 8; + } + + entropy->put_buffer = put_buffer; /* update variables */ + entropy->put_bits = put_bits; +} + + +LOCAL(void) +flush_bits (phuff_entropy_ptr entropy) +{ + emit_bits(entropy, 0x7F, 7); /* fill any partial byte with ones */ + entropy->put_buffer = 0; /* and reset bit-buffer to empty */ + entropy->put_bits = 0; +} + + +/* + * Emit (or just count) a Huffman symbol. + */ + +LOCAL(void) +emit_symbol (phuff_entropy_ptr entropy, int tbl_no, int symbol) +{ + if (entropy->gather_statistics) + entropy->count_ptrs[tbl_no][symbol]++; + else { + c_derived_tbl *tbl = entropy->derived_tbls[tbl_no]; + emit_bits(entropy, tbl->ehufco[symbol], tbl->ehufsi[symbol]); + } +} + + +/* + * Emit bits from a correction bit buffer. + */ + +LOCAL(void) +emit_buffered_bits (phuff_entropy_ptr entropy, char *bufstart, + unsigned int nbits) +{ + if (entropy->gather_statistics) + return; /* no real work */ + + while (nbits > 0) { + emit_bits(entropy, (unsigned int) (*bufstart), 1); + bufstart++; + nbits--; + } +} + + +/* + * Emit any pending EOBRUN symbol. + */ + +LOCAL(void) +emit_eobrun (phuff_entropy_ptr entropy) +{ + register int temp, nbits; + + if (entropy->EOBRUN > 0) { /* if there is any pending EOBRUN */ + temp = entropy->EOBRUN; + nbits = 0; + while ((temp >>= 1)) + nbits++; + /* safety check: shouldn't happen given limited correction-bit buffer */ + if (nbits > 14) + ERREXIT(entropy->cinfo, JERR_HUFF_MISSING_CODE); + + emit_symbol(entropy, entropy->ac_tbl_no, nbits << 4); + if (nbits) + emit_bits(entropy, entropy->EOBRUN, nbits); + + entropy->EOBRUN = 0; + + /* Emit any buffered correction bits */ + emit_buffered_bits(entropy, entropy->bit_buffer, entropy->BE); + entropy->BE = 0; + } +} + + +/* + * Emit a restart marker & resynchronize predictions. + */ + +LOCAL(void) +emit_restart (phuff_entropy_ptr entropy, int restart_num) +{ + int ci; + + emit_eobrun(entropy); + + if (! entropy->gather_statistics) { + flush_bits(entropy); + emit_byte(entropy, 0xFF); + emit_byte(entropy, JPEG_RST0 + restart_num); + } + + if (entropy->cinfo->Ss == 0) { + /* Re-initialize DC predictions to 0 */ + for (ci = 0; ci < entropy->cinfo->comps_in_scan; ci++) + entropy->last_dc_val[ci] = 0; + } else { + /* Re-initialize all AC-related fields to 0 */ + entropy->EOBRUN = 0; + entropy->BE = 0; + } +} + + +/* + * MCU encoding for DC initial scan (either spectral selection, + * or first pass of successive approximation). + */ + +METHODDEF(boolean) +encode_mcu_DC_first (j_compress_ptr cinfo, JBLOCKROW *MCU_data) +{ + phuff_entropy_ptr entropy = (phuff_entropy_ptr) cinfo->entropy; + register int temp, temp2; + register int nbits; + int blkn, ci; + int Al = cinfo->Al; + JBLOCKROW block; + jpeg_component_info *compptr; + ISHIFT_TEMPS + + entropy->next_output_byte = cinfo->dest->next_output_byte; + entropy->free_in_buffer = cinfo->dest->free_in_buffer; + + /* Emit restart marker if needed */ + if (cinfo->restart_interval) + if (entropy->restarts_to_go == 0) + emit_restart(entropy, entropy->next_restart_num); + + /* Encode the MCU data blocks */ + for (blkn = 0; blkn < cinfo->blocks_in_MCU; blkn++) { + block = MCU_data[blkn]; + ci = cinfo->MCU_membership[blkn]; + compptr = cinfo->cur_comp_info[ci]; + + /* Compute the DC value after the required point transform by Al. + * This is simply an arithmetic right shift. + */ + temp2 = IRIGHT_SHIFT((int) ((*block)[0]), Al); + + /* DC differences are figured on the point-transformed values. */ + temp = temp2 - entropy->last_dc_val[ci]; + entropy->last_dc_val[ci] = temp2; + + /* Encode the DC coefficient difference per section G.1.2.1 */ + temp2 = temp; + if (temp < 0) { + temp = -temp; /* temp is abs value of input */ + /* For a negative input, want temp2 = bitwise complement of abs(input) */ + /* This code assumes we are on a two's complement machine */ + temp2--; + } + + /* Find the number of bits needed for the magnitude of the coefficient */ + nbits = 0; + while (temp) { + nbits++; + temp >>= 1; + } + /* Check for out-of-range coefficient values. + * Since we're encoding a difference, the range limit is twice as much. + */ + if (nbits > MAX_COEF_BITS+1) + ERREXIT(cinfo, JERR_BAD_DCT_COEF); + + /* Count/emit the Huffman-coded symbol for the number of bits */ + emit_symbol(entropy, compptr->dc_tbl_no, nbits); + + /* Emit that number of bits of the value, if positive, */ + /* or the complement of its magnitude, if negative. */ + if (nbits) /* emit_bits rejects calls with size 0 */ + emit_bits(entropy, (unsigned int) temp2, nbits); + } + + cinfo->dest->next_output_byte = entropy->next_output_byte; + cinfo->dest->free_in_buffer = entropy->free_in_buffer; + + /* Update restart-interval state too */ + if (cinfo->restart_interval) { + if (entropy->restarts_to_go == 0) { + entropy->restarts_to_go = cinfo->restart_interval; + entropy->next_restart_num++; + entropy->next_restart_num &= 7; + } + entropy->restarts_to_go--; + } + + return TRUE; +} + + +/* + * MCU encoding for AC initial scan (either spectral selection, + * or first pass of successive approximation). + */ + +METHODDEF(boolean) +encode_mcu_AC_first (j_compress_ptr cinfo, JBLOCKROW *MCU_data) +{ + phuff_entropy_ptr entropy = (phuff_entropy_ptr) cinfo->entropy; + register int temp, temp2; + register int nbits; + register int r, k; + int Se = cinfo->Se; + int Al = cinfo->Al; + JBLOCKROW block; + + entropy->next_output_byte = cinfo->dest->next_output_byte; + entropy->free_in_buffer = cinfo->dest->free_in_buffer; + + /* Emit restart marker if needed */ + if (cinfo->restart_interval) + if (entropy->restarts_to_go == 0) + emit_restart(entropy, entropy->next_restart_num); + + /* Encode the MCU data block */ + block = MCU_data[0]; + + /* Encode the AC coefficients per section G.1.2.2, fig. G.3 */ + + r = 0; /* r = run length of zeros */ + + for (k = cinfo->Ss; k <= Se; k++) { + if ((temp = (*block)[jpeg_natural_order[k]]) == 0) { + r++; + continue; + } + /* We must apply the point transform by Al. For AC coefficients this + * is an integer division with rounding towards 0. To do this portably + * in C, we shift after obtaining the absolute value; so the code is + * interwoven with finding the abs value (temp) and output bits (temp2). + */ + if (temp < 0) { + temp = -temp; /* temp is abs value of input */ + temp >>= Al; /* apply the point transform */ + /* For a negative coef, want temp2 = bitwise complement of abs(coef) */ + temp2 = ~temp; + } else { + temp >>= Al; /* apply the point transform */ + temp2 = temp; + } + /* Watch out for case that nonzero coef is zero after point transform */ + if (temp == 0) { + r++; + continue; + } + + /* Emit any pending EOBRUN */ + if (entropy->EOBRUN > 0) + emit_eobrun(entropy); + /* if run length > 15, must emit special run-length-16 codes (0xF0) */ + while (r > 15) { + emit_symbol(entropy, entropy->ac_tbl_no, 0xF0); + r -= 16; + } + + /* Find the number of bits needed for the magnitude of the coefficient */ + nbits = 1; /* there must be at least one 1 bit */ + while ((temp >>= 1)) + nbits++; + /* Check for out-of-range coefficient values */ + if (nbits > MAX_COEF_BITS) + ERREXIT(cinfo, JERR_BAD_DCT_COEF); + + /* Count/emit Huffman symbol for run length / number of bits */ + emit_symbol(entropy, entropy->ac_tbl_no, (r << 4) + nbits); + + /* Emit that number of bits of the value, if positive, */ + /* or the complement of its magnitude, if negative. */ + emit_bits(entropy, (unsigned int) temp2, nbits); + + r = 0; /* reset zero run length */ + } + + if (r > 0) { /* If there are trailing zeroes, */ + entropy->EOBRUN++; /* count an EOB */ + if (entropy->EOBRUN == 0x7FFF) + emit_eobrun(entropy); /* force it out to avoid overflow */ + } + + cinfo->dest->next_output_byte = entropy->next_output_byte; + cinfo->dest->free_in_buffer = entropy->free_in_buffer; + + /* Update restart-interval state too */ + if (cinfo->restart_interval) { + if (entropy->restarts_to_go == 0) { + entropy->restarts_to_go = cinfo->restart_interval; + entropy->next_restart_num++; + entropy->next_restart_num &= 7; + } + entropy->restarts_to_go--; + } + + return TRUE; +} + + +/* + * MCU encoding for DC successive approximation refinement scan. + * Note: we assume such scans can be multi-component, although the spec + * is not very clear on the point. + */ + +METHODDEF(boolean) +encode_mcu_DC_refine (j_compress_ptr cinfo, JBLOCKROW *MCU_data) +{ + phuff_entropy_ptr entropy = (phuff_entropy_ptr) cinfo->entropy; + register int temp; + int blkn; + int Al = cinfo->Al; + JBLOCKROW block; + + entropy->next_output_byte = cinfo->dest->next_output_byte; + entropy->free_in_buffer = cinfo->dest->free_in_buffer; + + /* Emit restart marker if needed */ + if (cinfo->restart_interval) + if (entropy->restarts_to_go == 0) + emit_restart(entropy, entropy->next_restart_num); + + /* Encode the MCU data blocks */ + for (blkn = 0; blkn < cinfo->blocks_in_MCU; blkn++) { + block = MCU_data[blkn]; + + /* We simply emit the Al'th bit of the DC coefficient value. */ + temp = (*block)[0]; + emit_bits(entropy, (unsigned int) (temp >> Al), 1); + } + + cinfo->dest->next_output_byte = entropy->next_output_byte; + cinfo->dest->free_in_buffer = entropy->free_in_buffer; + + /* Update restart-interval state too */ + if (cinfo->restart_interval) { + if (entropy->restarts_to_go == 0) { + entropy->restarts_to_go = cinfo->restart_interval; + entropy->next_restart_num++; + entropy->next_restart_num &= 7; + } + entropy->restarts_to_go--; + } + + return TRUE; +} + + +/* + * MCU encoding for AC successive approximation refinement scan. + */ + +METHODDEF(boolean) +encode_mcu_AC_refine (j_compress_ptr cinfo, JBLOCKROW *MCU_data) +{ + phuff_entropy_ptr entropy = (phuff_entropy_ptr) cinfo->entropy; + register int temp; + register int r, k; + int EOB; + char *BR_buffer; + unsigned int BR; + int Se = cinfo->Se; + int Al = cinfo->Al; + JBLOCKROW block; + int absvalues[DCTSIZE2]; + + entropy->next_output_byte = cinfo->dest->next_output_byte; + entropy->free_in_buffer = cinfo->dest->free_in_buffer; + + /* Emit restart marker if needed */ + if (cinfo->restart_interval) + if (entropy->restarts_to_go == 0) + emit_restart(entropy, entropy->next_restart_num); + + /* Encode the MCU data block */ + block = MCU_data[0]; + + /* It is convenient to make a pre-pass to determine the transformed + * coefficients' absolute values and the EOB position. + */ + EOB = 0; + for (k = cinfo->Ss; k <= Se; k++) { + temp = (*block)[jpeg_natural_order[k]]; + /* We must apply the point transform by Al. For AC coefficients this + * is an integer division with rounding towards 0. To do this portably + * in C, we shift after obtaining the absolute value. + */ + if (temp < 0) + temp = -temp; /* temp is abs value of input */ + temp >>= Al; /* apply the point transform */ + absvalues[k] = temp; /* save abs value for main pass */ + if (temp == 1) + EOB = k; /* EOB = index of last newly-nonzero coef */ + } + + /* Encode the AC coefficients per section G.1.2.3, fig. G.7 */ + + r = 0; /* r = run length of zeros */ + BR = 0; /* BR = count of buffered bits added now */ + BR_buffer = entropy->bit_buffer + entropy->BE; /* Append bits to buffer */ + + for (k = cinfo->Ss; k <= Se; k++) { + if ((temp = absvalues[k]) == 0) { + r++; + continue; + } + + /* Emit any required ZRLs, but not if they can be folded into EOB */ + while (r > 15 && k <= EOB) { + /* emit any pending EOBRUN and the BE correction bits */ + emit_eobrun(entropy); + /* Emit ZRL */ + emit_symbol(entropy, entropy->ac_tbl_no, 0xF0); + r -= 16; + /* Emit buffered correction bits that must be associated with ZRL */ + emit_buffered_bits(entropy, BR_buffer, BR); + BR_buffer = entropy->bit_buffer; /* BE bits are gone now */ + BR = 0; + } + + /* If the coef was previously nonzero, it only needs a correction bit. + * NOTE: a straight translation of the spec's figure G.7 would suggest + * that we also need to test r > 15. But if r > 15, we can only get here + * if k > EOB, which implies that this coefficient is not 1. + */ + if (temp > 1) { + /* The correction bit is the next bit of the absolute value. */ + BR_buffer[BR++] = (char) (temp & 1); + continue; + } + + /* Emit any pending EOBRUN and the BE correction bits */ + emit_eobrun(entropy); + + /* Count/emit Huffman symbol for run length / number of bits */ + emit_symbol(entropy, entropy->ac_tbl_no, (r << 4) + 1); + + /* Emit output bit for newly-nonzero coef */ + temp = ((*block)[jpeg_natural_order[k]] < 0) ? 0 : 1; + emit_bits(entropy, (unsigned int) temp, 1); + + /* Emit buffered correction bits that must be associated with this code */ + emit_buffered_bits(entropy, BR_buffer, BR); + BR_buffer = entropy->bit_buffer; /* BE bits are gone now */ + BR = 0; + r = 0; /* reset zero run length */ + } + + if (r > 0 || BR > 0) { /* If there are trailing zeroes, */ + entropy->EOBRUN++; /* count an EOB */ + entropy->BE += BR; /* concat my correction bits to older ones */ + /* We force out the EOB if we risk either: + * 1. overflow of the EOB counter; + * 2. overflow of the correction bit buffer during the next MCU. + */ + if (entropy->EOBRUN == 0x7FFF || entropy->BE > (MAX_CORR_BITS-DCTSIZE2+1)) + emit_eobrun(entropy); + } + + cinfo->dest->next_output_byte = entropy->next_output_byte; + cinfo->dest->free_in_buffer = entropy->free_in_buffer; + + /* Update restart-interval state too */ + if (cinfo->restart_interval) { + if (entropy->restarts_to_go == 0) { + entropy->restarts_to_go = cinfo->restart_interval; + entropy->next_restart_num++; + entropy->next_restart_num &= 7; + } + entropy->restarts_to_go--; + } + + return TRUE; +} + + +/* + * Finish up at the end of a Huffman-compressed progressive scan. + */ + +METHODDEF(void) +finish_pass_phuff (j_compress_ptr cinfo) +{ + phuff_entropy_ptr entropy = (phuff_entropy_ptr) cinfo->entropy; + + entropy->next_output_byte = cinfo->dest->next_output_byte; + entropy->free_in_buffer = cinfo->dest->free_in_buffer; + + /* Flush out any buffered data */ + emit_eobrun(entropy); + flush_bits(entropy); + + cinfo->dest->next_output_byte = entropy->next_output_byte; + cinfo->dest->free_in_buffer = entropy->free_in_buffer; +} + + +/* + * Finish up a statistics-gathering pass and create the new Huffman tables. + */ + +METHODDEF(void) +finish_pass_gather_phuff (j_compress_ptr cinfo) +{ + phuff_entropy_ptr entropy = (phuff_entropy_ptr) cinfo->entropy; + boolean is_DC_band; + int ci, tbl; + jpeg_component_info *compptr; + JHUFF_TBL **htblptr; + boolean did[NUM_HUFF_TBLS]; + + /* Flush out buffered data (all we care about is counting the EOB symbol) */ + emit_eobrun(entropy); + + is_DC_band = (cinfo->Ss == 0); + + /* It's important not to apply jpeg_gen_optimal_table more than once + * per table, because it clobbers the input frequency counts! + */ + MEMZERO(did, sizeof(did)); + + for (ci = 0; ci < cinfo->comps_in_scan; ci++) { + compptr = cinfo->cur_comp_info[ci]; + if (is_DC_band) { + if (cinfo->Ah != 0) /* DC refinement needs no table */ + continue; + tbl = compptr->dc_tbl_no; + } else { + tbl = compptr->ac_tbl_no; + } + if (! did[tbl]) { + if (is_DC_band) + htblptr = & cinfo->dc_huff_tbl_ptrs[tbl]; + else + htblptr = & cinfo->ac_huff_tbl_ptrs[tbl]; + if (*htblptr == NULL) + *htblptr = jpeg_alloc_huff_table((j_common_ptr) cinfo); + jpeg_gen_optimal_table(cinfo, *htblptr, entropy->count_ptrs[tbl]); + did[tbl] = TRUE; + } + } +} + + +/* + * Module initialization routine for progressive Huffman entropy encoding. + */ + +GLOBAL(void) +jinit_phuff_encoder (j_compress_ptr cinfo) +{ + phuff_entropy_ptr entropy; + int i; + + entropy = (phuff_entropy_ptr) + (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE, + sizeof(phuff_entropy_encoder)); + cinfo->entropy = (struct jpeg_entropy_encoder *) entropy; + entropy->pub.start_pass = start_pass_phuff; + + /* Mark tables unallocated */ + for (i = 0; i < NUM_HUFF_TBLS; i++) { + entropy->derived_tbls[i] = NULL; + entropy->count_ptrs[i] = NULL; + } + entropy->bit_buffer = NULL; /* needed only in AC refinement scan */ +} + +#endif /* C_PROGRESSIVE_SUPPORTED */ diff --git a/media/libjpeg/jcprepct.c b/media/libjpeg/jcprepct.c new file mode 100644 index 000000000..e72ebd87d --- /dev/null +++ b/media/libjpeg/jcprepct.c @@ -0,0 +1,357 @@ +/* + * jcprepct.c + * + * This file is part of the Independent JPEG Group's software: + * Copyright (C) 1994-1996, Thomas G. Lane. + * It was modified by The libjpeg-turbo Project to include only code relevant + * to libjpeg-turbo. + * For conditions of distribution and use, see the accompanying README.ijg + * file. + * + * This file contains the compression preprocessing controller. + * This controller manages the color conversion, downsampling, + * and edge expansion steps. + * + * Most of the complexity here is associated with buffering input rows + * as required by the downsampler. See the comments at the head of + * jcsample.c for the downsampler's needs. + */ + +#define JPEG_INTERNALS +#include "jinclude.h" +#include "jpeglib.h" + + +/* At present, jcsample.c can request context rows only for smoothing. + * In the future, we might also need context rows for CCIR601 sampling + * or other more-complex downsampling procedures. The code to support + * context rows should be compiled only if needed. + */ +#ifdef INPUT_SMOOTHING_SUPPORTED +#define CONTEXT_ROWS_SUPPORTED +#endif + + +/* + * For the simple (no-context-row) case, we just need to buffer one + * row group's worth of pixels for the downsampling step. At the bottom of + * the image, we pad to a full row group by replicating the last pixel row. + * The downsampler's last output row is then replicated if needed to pad + * out to a full iMCU row. + * + * When providing context rows, we must buffer three row groups' worth of + * pixels. Three row groups are physically allocated, but the row pointer + * arrays are made five row groups high, with the extra pointers above and + * below "wrapping around" to point to the last and first real row groups. + * This allows the downsampler to access the proper context rows. + * At the top and bottom of the image, we create dummy context rows by + * copying the first or last real pixel row. This copying could be avoided + * by pointer hacking as is done in jdmainct.c, but it doesn't seem worth the + * trouble on the compression side. + */ + + +/* Private buffer controller object */ + +typedef struct { + struct jpeg_c_prep_controller pub; /* public fields */ + + /* Downsampling input buffer. This buffer holds color-converted data + * until we have enough to do a downsample step. + */ + JSAMPARRAY color_buf[MAX_COMPONENTS]; + + JDIMENSION rows_to_go; /* counts rows remaining in source image */ + int next_buf_row; /* index of next row to store in color_buf */ + +#ifdef CONTEXT_ROWS_SUPPORTED /* only needed for context case */ + int this_row_group; /* starting row index of group to process */ + int next_buf_stop; /* downsample when we reach this index */ +#endif +} my_prep_controller; + +typedef my_prep_controller *my_prep_ptr; + + +/* + * Initialize for a processing pass. + */ + +METHODDEF(void) +start_pass_prep (j_compress_ptr cinfo, J_BUF_MODE pass_mode) +{ + my_prep_ptr prep = (my_prep_ptr) cinfo->prep; + + if (pass_mode != JBUF_PASS_THRU) + ERREXIT(cinfo, JERR_BAD_BUFFER_MODE); + + /* Initialize total-height counter for detecting bottom of image */ + prep->rows_to_go = cinfo->image_height; + /* Mark the conversion buffer empty */ + prep->next_buf_row = 0; +#ifdef CONTEXT_ROWS_SUPPORTED + /* Preset additional state variables for context mode. + * These aren't used in non-context mode, so we needn't test which mode. + */ + prep->this_row_group = 0; + /* Set next_buf_stop to stop after two row groups have been read in. */ + prep->next_buf_stop = 2 * cinfo->max_v_samp_factor; +#endif +} + + +/* + * Expand an image vertically from height input_rows to height output_rows, + * by duplicating the bottom row. + */ + +LOCAL(void) +expand_bottom_edge (JSAMPARRAY image_data, JDIMENSION num_cols, + int input_rows, int output_rows) +{ + register int row; + + for (row = input_rows; row < output_rows; row++) { + jcopy_sample_rows(image_data, input_rows-1, image_data, row, + 1, num_cols); + } +} + + +/* + * Process some data in the simple no-context case. + * + * Preprocessor output data is counted in "row groups". A row group + * is defined to be v_samp_factor sample rows of each component. + * Downsampling will produce this much data from each max_v_samp_factor + * input rows. + */ + +METHODDEF(void) +pre_process_data (j_compress_ptr cinfo, + JSAMPARRAY input_buf, JDIMENSION *in_row_ctr, + JDIMENSION in_rows_avail, + JSAMPIMAGE output_buf, JDIMENSION *out_row_group_ctr, + JDIMENSION out_row_groups_avail) +{ + my_prep_ptr prep = (my_prep_ptr) cinfo->prep; + int numrows, ci; + JDIMENSION inrows; + jpeg_component_info *compptr; + + while (*in_row_ctr < in_rows_avail && + *out_row_group_ctr < out_row_groups_avail) { + /* Do color conversion to fill the conversion buffer. */ + inrows = in_rows_avail - *in_row_ctr; + numrows = cinfo->max_v_samp_factor - prep->next_buf_row; + numrows = (int) MIN((JDIMENSION) numrows, inrows); + (*cinfo->cconvert->color_convert) (cinfo, input_buf + *in_row_ctr, + prep->color_buf, + (JDIMENSION) prep->next_buf_row, + numrows); + *in_row_ctr += numrows; + prep->next_buf_row += numrows; + prep->rows_to_go -= numrows; + /* If at bottom of image, pad to fill the conversion buffer. */ + if (prep->rows_to_go == 0 && + prep->next_buf_row < cinfo->max_v_samp_factor) { + for (ci = 0; ci < cinfo->num_components; ci++) { + expand_bottom_edge(prep->color_buf[ci], cinfo->image_width, + prep->next_buf_row, cinfo->max_v_samp_factor); + } + prep->next_buf_row = cinfo->max_v_samp_factor; + } + /* If we've filled the conversion buffer, empty it. */ + if (prep->next_buf_row == cinfo->max_v_samp_factor) { + (*cinfo->downsample->downsample) (cinfo, + prep->color_buf, (JDIMENSION) 0, + output_buf, *out_row_group_ctr); + prep->next_buf_row = 0; + (*out_row_group_ctr)++; + } + /* If at bottom of image, pad the output to a full iMCU height. + * Note we assume the caller is providing a one-iMCU-height output buffer! + */ + if (prep->rows_to_go == 0 && + *out_row_group_ctr < out_row_groups_avail) { + for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components; + ci++, compptr++) { + expand_bottom_edge(output_buf[ci], + compptr->width_in_blocks * DCTSIZE, + (int) (*out_row_group_ctr * compptr->v_samp_factor), + (int) (out_row_groups_avail * compptr->v_samp_factor)); + } + *out_row_group_ctr = out_row_groups_avail; + break; /* can exit outer loop without test */ + } + } +} + + +#ifdef CONTEXT_ROWS_SUPPORTED + +/* + * Process some data in the context case. + */ + +METHODDEF(void) +pre_process_context (j_compress_ptr cinfo, + JSAMPARRAY input_buf, JDIMENSION *in_row_ctr, + JDIMENSION in_rows_avail, + JSAMPIMAGE output_buf, JDIMENSION *out_row_group_ctr, + JDIMENSION out_row_groups_avail) +{ + my_prep_ptr prep = (my_prep_ptr) cinfo->prep; + int numrows, ci; + int buf_height = cinfo->max_v_samp_factor * 3; + JDIMENSION inrows; + + while (*out_row_group_ctr < out_row_groups_avail) { + if (*in_row_ctr < in_rows_avail) { + /* Do color conversion to fill the conversion buffer. */ + inrows = in_rows_avail - *in_row_ctr; + numrows = prep->next_buf_stop - prep->next_buf_row; + numrows = (int) MIN((JDIMENSION) numrows, inrows); + (*cinfo->cconvert->color_convert) (cinfo, input_buf + *in_row_ctr, + prep->color_buf, + (JDIMENSION) prep->next_buf_row, + numrows); + /* Pad at top of image, if first time through */ + if (prep->rows_to_go == cinfo->image_height) { + for (ci = 0; ci < cinfo->num_components; ci++) { + int row; + for (row = 1; row <= cinfo->max_v_samp_factor; row++) { + jcopy_sample_rows(prep->color_buf[ci], 0, + prep->color_buf[ci], -row, + 1, cinfo->image_width); + } + } + } + *in_row_ctr += numrows; + prep->next_buf_row += numrows; + prep->rows_to_go -= numrows; + } else { + /* Return for more data, unless we are at the bottom of the image. */ + if (prep->rows_to_go != 0) + break; + /* When at bottom of image, pad to fill the conversion buffer. */ + if (prep->next_buf_row < prep->next_buf_stop) { + for (ci = 0; ci < cinfo->num_components; ci++) { + expand_bottom_edge(prep->color_buf[ci], cinfo->image_width, + prep->next_buf_row, prep->next_buf_stop); + } + prep->next_buf_row = prep->next_buf_stop; + } + } + /* If we've gotten enough data, downsample a row group. */ + if (prep->next_buf_row == prep->next_buf_stop) { + (*cinfo->downsample->downsample) (cinfo, + prep->color_buf, + (JDIMENSION) prep->this_row_group, + output_buf, *out_row_group_ctr); + (*out_row_group_ctr)++; + /* Advance pointers with wraparound as necessary. */ + prep->this_row_group += cinfo->max_v_samp_factor; + if (prep->this_row_group >= buf_height) + prep->this_row_group = 0; + if (prep->next_buf_row >= buf_height) + prep->next_buf_row = 0; + prep->next_buf_stop = prep->next_buf_row + cinfo->max_v_samp_factor; + } + } +} + + +/* + * Create the wrapped-around downsampling input buffer needed for context mode. + */ + +LOCAL(void) +create_context_buffer (j_compress_ptr cinfo) +{ + my_prep_ptr prep = (my_prep_ptr) cinfo->prep; + int rgroup_height = cinfo->max_v_samp_factor; + int ci, i; + jpeg_component_info *compptr; + JSAMPARRAY true_buffer, fake_buffer; + + /* Grab enough space for fake row pointers for all the components; + * we need five row groups' worth of pointers for each component. + */ + fake_buffer = (JSAMPARRAY) + (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE, + (cinfo->num_components * 5 * rgroup_height) * + sizeof(JSAMPROW)); + + for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components; + ci++, compptr++) { + /* Allocate the actual buffer space (3 row groups) for this component. + * We make the buffer wide enough to allow the downsampler to edge-expand + * horizontally within the buffer, if it so chooses. + */ + true_buffer = (*cinfo->mem->alloc_sarray) + ((j_common_ptr) cinfo, JPOOL_IMAGE, + (JDIMENSION) (((long) compptr->width_in_blocks * DCTSIZE * + cinfo->max_h_samp_factor) / compptr->h_samp_factor), + (JDIMENSION) (3 * rgroup_height)); + /* Copy true buffer row pointers into the middle of the fake row array */ + MEMCOPY(fake_buffer + rgroup_height, true_buffer, + 3 * rgroup_height * sizeof(JSAMPROW)); + /* Fill in the above and below wraparound pointers */ + for (i = 0; i < rgroup_height; i++) { + fake_buffer[i] = true_buffer[2 * rgroup_height + i]; + fake_buffer[4 * rgroup_height + i] = true_buffer[i]; + } + prep->color_buf[ci] = fake_buffer + rgroup_height; + fake_buffer += 5 * rgroup_height; /* point to space for next component */ + } +} + +#endif /* CONTEXT_ROWS_SUPPORTED */ + + +/* + * Initialize preprocessing controller. + */ + +GLOBAL(void) +jinit_c_prep_controller (j_compress_ptr cinfo, boolean need_full_buffer) +{ + my_prep_ptr prep; + int ci; + jpeg_component_info *compptr; + + if (need_full_buffer) /* safety check */ + ERREXIT(cinfo, JERR_BAD_BUFFER_MODE); + + prep = (my_prep_ptr) + (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE, + sizeof(my_prep_controller)); + cinfo->prep = (struct jpeg_c_prep_controller *) prep; + prep->pub.start_pass = start_pass_prep; + + /* Allocate the color conversion buffer. + * We make the buffer wide enough to allow the downsampler to edge-expand + * horizontally within the buffer, if it so chooses. + */ + if (cinfo->downsample->need_context_rows) { + /* Set up to provide context rows */ +#ifdef CONTEXT_ROWS_SUPPORTED + prep->pub.pre_process_data = pre_process_context; + create_context_buffer(cinfo); +#else + ERREXIT(cinfo, JERR_NOT_COMPILED); +#endif + } else { + /* No context, just make it tall enough for one row group */ + prep->pub.pre_process_data = pre_process_data; + for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components; + ci++, compptr++) { + prep->color_buf[ci] = (*cinfo->mem->alloc_sarray) + ((j_common_ptr) cinfo, JPOOL_IMAGE, + (JDIMENSION) (((long) compptr->width_in_blocks * DCTSIZE * + cinfo->max_h_samp_factor) / compptr->h_samp_factor), + (JDIMENSION) cinfo->max_v_samp_factor); + } + } +} diff --git a/media/libjpeg/jcsample.c b/media/libjpeg/jcsample.c new file mode 100644 index 000000000..c4b499148 --- /dev/null +++ b/media/libjpeg/jcsample.c @@ -0,0 +1,539 @@ +/* + * jcsample.c + * + * This file was part of the Independent JPEG Group's software: + * Copyright (C) 1991-1996, Thomas G. Lane. + * libjpeg-turbo Modifications: + * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB + * Copyright (C) 2014, MIPS Technologies, Inc., California. + * Copyright (C) 2015, D. R. Commander. + * For conditions of distribution and use, see the accompanying README.ijg + * file. + * + * This file contains downsampling routines. + * + * Downsampling input data is counted in "row groups". A row group + * is defined to be max_v_samp_factor pixel rows of each component, + * from which the downsampler produces v_samp_factor sample rows. + * A single row group is processed in each call to the downsampler module. + * + * The downsampler is responsible for edge-expansion of its output data + * to fill an integral number of DCT blocks horizontally. The source buffer + * may be modified if it is helpful for this purpose (the source buffer is + * allocated wide enough to correspond to the desired output width). + * The caller (the prep controller) is responsible for vertical padding. + * + * The downsampler may request "context rows" by setting need_context_rows + * during startup. In this case, the input arrays will contain at least + * one row group's worth of pixels above and below the passed-in data; + * the caller will create dummy rows at image top and bottom by replicating + * the first or last real pixel row. + * + * An excellent reference for image resampling is + * Digital Image Warping, George Wolberg, 1990. + * Pub. by IEEE Computer Society Press, Los Alamitos, CA. ISBN 0-8186-8944-7. + * + * The downsampling algorithm used here is a simple average of the source + * pixels covered by the output pixel. The hi-falutin sampling literature + * refers to this as a "box filter". In general the characteristics of a box + * filter are not very good, but for the specific cases we normally use (1:1 + * and 2:1 ratios) the box is equivalent to a "triangle filter" which is not + * nearly so bad. If you intend to use other sampling ratios, you'd be well + * advised to improve this code. + * + * A simple input-smoothing capability is provided. This is mainly intended + * for cleaning up color-dithered GIF input files (if you find it inadequate, + * we suggest using an external filtering program such as pnmconvol). When + * enabled, each input pixel P is replaced by a weighted sum of itself and its + * eight neighbors. P's weight is 1-8*SF and each neighbor's weight is SF, + * where SF = (smoothing_factor / 1024). + * Currently, smoothing is only supported for 2h2v sampling factors. + */ + +#define JPEG_INTERNALS +#include "jinclude.h" +#include "jpeglib.h" +#include "jsimd.h" + + +/* Pointer to routine to downsample a single component */ +typedef void (*downsample1_ptr) (j_compress_ptr cinfo, + jpeg_component_info *compptr, + JSAMPARRAY input_data, + JSAMPARRAY output_data); + +/* Private subobject */ + +typedef struct { + struct jpeg_downsampler pub; /* public fields */ + + /* Downsampling method pointers, one per component */ + downsample1_ptr methods[MAX_COMPONENTS]; +} my_downsampler; + +typedef my_downsampler *my_downsample_ptr; + + +/* + * Initialize for a downsampling pass. + */ + +METHODDEF(void) +start_pass_downsample (j_compress_ptr cinfo) +{ + /* no work for now */ +} + + +/* + * Expand a component horizontally from width input_cols to width output_cols, + * by duplicating the rightmost samples. + */ + +LOCAL(void) +expand_right_edge (JSAMPARRAY image_data, int num_rows, + JDIMENSION input_cols, JDIMENSION output_cols) +{ + register JSAMPROW ptr; + register JSAMPLE pixval; + register int count; + int row; + int numcols = (int) (output_cols - input_cols); + + if (numcols > 0) { + for (row = 0; row < num_rows; row++) { + ptr = image_data[row] + input_cols; + pixval = ptr[-1]; /* don't need GETJSAMPLE() here */ + for (count = numcols; count > 0; count--) + *ptr++ = pixval; + } + } +} + + +/* + * Do downsampling for a whole row group (all components). + * + * In this version we simply downsample each component independently. + */ + +METHODDEF(void) +sep_downsample (j_compress_ptr cinfo, + JSAMPIMAGE input_buf, JDIMENSION in_row_index, + JSAMPIMAGE output_buf, JDIMENSION out_row_group_index) +{ + my_downsample_ptr downsample = (my_downsample_ptr) cinfo->downsample; + int ci; + jpeg_component_info *compptr; + JSAMPARRAY in_ptr, out_ptr; + + for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components; + ci++, compptr++) { + in_ptr = input_buf[ci] + in_row_index; + out_ptr = output_buf[ci] + (out_row_group_index * compptr->v_samp_factor); + (*downsample->methods[ci]) (cinfo, compptr, in_ptr, out_ptr); + } +} + + +/* + * Downsample pixel values of a single component. + * One row group is processed per call. + * This version handles arbitrary integral sampling ratios, without smoothing. + * Note that this version is not actually used for customary sampling ratios. + */ + +METHODDEF(void) +int_downsample (j_compress_ptr cinfo, jpeg_component_info *compptr, + JSAMPARRAY input_data, JSAMPARRAY output_data) +{ + int inrow, outrow, h_expand, v_expand, numpix, numpix2, h, v; + JDIMENSION outcol, outcol_h; /* outcol_h == outcol*h_expand */ + JDIMENSION output_cols = compptr->width_in_blocks * DCTSIZE; + JSAMPROW inptr, outptr; + JLONG outvalue; + + h_expand = cinfo->max_h_samp_factor / compptr->h_samp_factor; + v_expand = cinfo->max_v_samp_factor / compptr->v_samp_factor; + numpix = h_expand * v_expand; + numpix2 = numpix/2; + + /* Expand input data enough to let all the output samples be generated + * by the standard loop. Special-casing padded output would be more + * efficient. + */ + expand_right_edge(input_data, cinfo->max_v_samp_factor, + cinfo->image_width, output_cols * h_expand); + + inrow = 0; + for (outrow = 0; outrow < compptr->v_samp_factor; outrow++) { + outptr = output_data[outrow]; + for (outcol = 0, outcol_h = 0; outcol < output_cols; + outcol++, outcol_h += h_expand) { + outvalue = 0; + for (v = 0; v < v_expand; v++) { + inptr = input_data[inrow+v] + outcol_h; + for (h = 0; h < h_expand; h++) { + outvalue += (JLONG) GETJSAMPLE(*inptr++); + } + } + *outptr++ = (JSAMPLE) ((outvalue + numpix2) / numpix); + } + inrow += v_expand; + } +} + + +/* + * Downsample pixel values of a single component. + * This version handles the special case of a full-size component, + * without smoothing. + */ + +METHODDEF(void) +fullsize_downsample (j_compress_ptr cinfo, jpeg_component_info *compptr, + JSAMPARRAY input_data, JSAMPARRAY output_data) +{ + /* Copy the data */ + jcopy_sample_rows(input_data, 0, output_data, 0, + cinfo->max_v_samp_factor, cinfo->image_width); + /* Edge-expand */ + expand_right_edge(output_data, cinfo->max_v_samp_factor, + cinfo->image_width, compptr->width_in_blocks * DCTSIZE); +} + + +/* + * Downsample pixel values of a single component. + * This version handles the common case of 2:1 horizontal and 1:1 vertical, + * without smoothing. + * + * A note about the "bias" calculations: when rounding fractional values to + * integer, we do not want to always round 0.5 up to the next integer. + * If we did that, we'd introduce a noticeable bias towards larger values. + * Instead, this code is arranged so that 0.5 will be rounded up or down at + * alternate pixel locations (a simple ordered dither pattern). + */ + +METHODDEF(void) +h2v1_downsample (j_compress_ptr cinfo, jpeg_component_info *compptr, + JSAMPARRAY input_data, JSAMPARRAY output_data) +{ + int outrow; + JDIMENSION outcol; + JDIMENSION output_cols = compptr->width_in_blocks * DCTSIZE; + register JSAMPROW inptr, outptr; + register int bias; + + /* Expand input data enough to let all the output samples be generated + * by the standard loop. Special-casing padded output would be more + * efficient. + */ + expand_right_edge(input_data, cinfo->max_v_samp_factor, + cinfo->image_width, output_cols * 2); + + for (outrow = 0; outrow < compptr->v_samp_factor; outrow++) { + outptr = output_data[outrow]; + inptr = input_data[outrow]; + bias = 0; /* bias = 0,1,0,1,... for successive samples */ + for (outcol = 0; outcol < output_cols; outcol++) { + *outptr++ = (JSAMPLE) ((GETJSAMPLE(*inptr) + GETJSAMPLE(inptr[1]) + + bias) >> 1); + bias ^= 1; /* 0=>1, 1=>0 */ + inptr += 2; + } + } +} + + +/* + * Downsample pixel values of a single component. + * This version handles the standard case of 2:1 horizontal and 2:1 vertical, + * without smoothing. + */ + +METHODDEF(void) +h2v2_downsample (j_compress_ptr cinfo, jpeg_component_info *compptr, + JSAMPARRAY input_data, JSAMPARRAY output_data) +{ + int inrow, outrow; + JDIMENSION outcol; + JDIMENSION output_cols = compptr->width_in_blocks * DCTSIZE; + register JSAMPROW inptr0, inptr1, outptr; + register int bias; + + /* Expand input data enough to let all the output samples be generated + * by the standard loop. Special-casing padded output would be more + * efficient. + */ + expand_right_edge(input_data, cinfo->max_v_samp_factor, + cinfo->image_width, output_cols * 2); + + inrow = 0; + for (outrow = 0; outrow < compptr->v_samp_factor; outrow++) { + outptr = output_data[outrow]; + inptr0 = input_data[inrow]; + inptr1 = input_data[inrow+1]; + bias = 1; /* bias = 1,2,1,2,... for successive samples */ + for (outcol = 0; outcol < output_cols; outcol++) { + *outptr++ = (JSAMPLE) ((GETJSAMPLE(*inptr0) + GETJSAMPLE(inptr0[1]) + + GETJSAMPLE(*inptr1) + GETJSAMPLE(inptr1[1]) + + bias) >> 2); + bias ^= 3; /* 1=>2, 2=>1 */ + inptr0 += 2; inptr1 += 2; + } + inrow += 2; + } +} + + +#ifdef INPUT_SMOOTHING_SUPPORTED + +/* + * Downsample pixel values of a single component. + * This version handles the standard case of 2:1 horizontal and 2:1 vertical, + * with smoothing. One row of context is required. + */ + +METHODDEF(void) +h2v2_smooth_downsample (j_compress_ptr cinfo, jpeg_component_info *compptr, + JSAMPARRAY input_data, JSAMPARRAY output_data) +{ + int inrow, outrow; + JDIMENSION colctr; + JDIMENSION output_cols = compptr->width_in_blocks * DCTSIZE; + register JSAMPROW inptr0, inptr1, above_ptr, below_ptr, outptr; + JLONG membersum, neighsum, memberscale, neighscale; + + /* Expand input data enough to let all the output samples be generated + * by the standard loop. Special-casing padded output would be more + * efficient. + */ + expand_right_edge(input_data - 1, cinfo->max_v_samp_factor + 2, + cinfo->image_width, output_cols * 2); + + /* We don't bother to form the individual "smoothed" input pixel values; + * we can directly compute the output which is the average of the four + * smoothed values. Each of the four member pixels contributes a fraction + * (1-8*SF) to its own smoothed image and a fraction SF to each of the three + * other smoothed pixels, therefore a total fraction (1-5*SF)/4 to the final + * output. The four corner-adjacent neighbor pixels contribute a fraction + * SF to just one smoothed pixel, or SF/4 to the final output; while the + * eight edge-adjacent neighbors contribute SF to each of two smoothed + * pixels, or SF/2 overall. In order to use integer arithmetic, these + * factors are scaled by 2^16 = 65536. + * Also recall that SF = smoothing_factor / 1024. + */ + + memberscale = 16384 - cinfo->smoothing_factor * 80; /* scaled (1-5*SF)/4 */ + neighscale = cinfo->smoothing_factor * 16; /* scaled SF/4 */ + + inrow = 0; + for (outrow = 0; outrow < compptr->v_samp_factor; outrow++) { + outptr = output_data[outrow]; + inptr0 = input_data[inrow]; + inptr1 = input_data[inrow+1]; + above_ptr = input_data[inrow-1]; + below_ptr = input_data[inrow+2]; + + /* Special case for first column: pretend column -1 is same as column 0 */ + membersum = GETJSAMPLE(*inptr0) + GETJSAMPLE(inptr0[1]) + + GETJSAMPLE(*inptr1) + GETJSAMPLE(inptr1[1]); + neighsum = GETJSAMPLE(*above_ptr) + GETJSAMPLE(above_ptr[1]) + + GETJSAMPLE(*below_ptr) + GETJSAMPLE(below_ptr[1]) + + GETJSAMPLE(*inptr0) + GETJSAMPLE(inptr0[2]) + + GETJSAMPLE(*inptr1) + GETJSAMPLE(inptr1[2]); + neighsum += neighsum; + neighsum += GETJSAMPLE(*above_ptr) + GETJSAMPLE(above_ptr[2]) + + GETJSAMPLE(*below_ptr) + GETJSAMPLE(below_ptr[2]); + membersum = membersum * memberscale + neighsum * neighscale; + *outptr++ = (JSAMPLE) ((membersum + 32768) >> 16); + inptr0 += 2; inptr1 += 2; above_ptr += 2; below_ptr += 2; + + for (colctr = output_cols - 2; colctr > 0; colctr--) { + /* sum of pixels directly mapped to this output element */ + membersum = GETJSAMPLE(*inptr0) + GETJSAMPLE(inptr0[1]) + + GETJSAMPLE(*inptr1) + GETJSAMPLE(inptr1[1]); + /* sum of edge-neighbor pixels */ + neighsum = GETJSAMPLE(*above_ptr) + GETJSAMPLE(above_ptr[1]) + + GETJSAMPLE(*below_ptr) + GETJSAMPLE(below_ptr[1]) + + GETJSAMPLE(inptr0[-1]) + GETJSAMPLE(inptr0[2]) + + GETJSAMPLE(inptr1[-1]) + GETJSAMPLE(inptr1[2]); + /* The edge-neighbors count twice as much as corner-neighbors */ + neighsum += neighsum; + /* Add in the corner-neighbors */ + neighsum += GETJSAMPLE(above_ptr[-1]) + GETJSAMPLE(above_ptr[2]) + + GETJSAMPLE(below_ptr[-1]) + GETJSAMPLE(below_ptr[2]); + /* form final output scaled up by 2^16 */ + membersum = membersum * memberscale + neighsum * neighscale; + /* round, descale and output it */ + *outptr++ = (JSAMPLE) ((membersum + 32768) >> 16); + inptr0 += 2; inptr1 += 2; above_ptr += 2; below_ptr += 2; + } + + /* Special case for last column */ + membersum = GETJSAMPLE(*inptr0) + GETJSAMPLE(inptr0[1]) + + GETJSAMPLE(*inptr1) + GETJSAMPLE(inptr1[1]); + neighsum = GETJSAMPLE(*above_ptr) + GETJSAMPLE(above_ptr[1]) + + GETJSAMPLE(*below_ptr) + GETJSAMPLE(below_ptr[1]) + + GETJSAMPLE(inptr0[-1]) + GETJSAMPLE(inptr0[1]) + + GETJSAMPLE(inptr1[-1]) + GETJSAMPLE(inptr1[1]); + neighsum += neighsum; + neighsum += GETJSAMPLE(above_ptr[-1]) + GETJSAMPLE(above_ptr[1]) + + GETJSAMPLE(below_ptr[-1]) + GETJSAMPLE(below_ptr[1]); + membersum = membersum * memberscale + neighsum * neighscale; + *outptr = (JSAMPLE) ((membersum + 32768) >> 16); + + inrow += 2; + } +} + + +/* + * Downsample pixel values of a single component. + * This version handles the special case of a full-size component, + * with smoothing. One row of context is required. + */ + +METHODDEF(void) +fullsize_smooth_downsample (j_compress_ptr cinfo, jpeg_component_info *compptr, + JSAMPARRAY input_data, JSAMPARRAY output_data) +{ + int outrow; + JDIMENSION colctr; + JDIMENSION output_cols = compptr->width_in_blocks * DCTSIZE; + register JSAMPROW inptr, above_ptr, below_ptr, outptr; + JLONG membersum, neighsum, memberscale, neighscale; + int colsum, lastcolsum, nextcolsum; + + /* Expand input data enough to let all the output samples be generated + * by the standard loop. Special-casing padded output would be more + * efficient. + */ + expand_right_edge(input_data - 1, cinfo->max_v_samp_factor + 2, + cinfo->image_width, output_cols); + + /* Each of the eight neighbor pixels contributes a fraction SF to the + * smoothed pixel, while the main pixel contributes (1-8*SF). In order + * to use integer arithmetic, these factors are multiplied by 2^16 = 65536. + * Also recall that SF = smoothing_factor / 1024. + */ + + memberscale = 65536L - cinfo->smoothing_factor * 512L; /* scaled 1-8*SF */ + neighscale = cinfo->smoothing_factor * 64; /* scaled SF */ + + for (outrow = 0; outrow < compptr->v_samp_factor; outrow++) { + outptr = output_data[outrow]; + inptr = input_data[outrow]; + above_ptr = input_data[outrow-1]; + below_ptr = input_data[outrow+1]; + + /* Special case for first column */ + colsum = GETJSAMPLE(*above_ptr++) + GETJSAMPLE(*below_ptr++) + + GETJSAMPLE(*inptr); + membersum = GETJSAMPLE(*inptr++); + nextcolsum = GETJSAMPLE(*above_ptr) + GETJSAMPLE(*below_ptr) + + GETJSAMPLE(*inptr); + neighsum = colsum + (colsum - membersum) + nextcolsum; + membersum = membersum * memberscale + neighsum * neighscale; + *outptr++ = (JSAMPLE) ((membersum + 32768) >> 16); + lastcolsum = colsum; colsum = nextcolsum; + + for (colctr = output_cols - 2; colctr > 0; colctr--) { + membersum = GETJSAMPLE(*inptr++); + above_ptr++; below_ptr++; + nextcolsum = GETJSAMPLE(*above_ptr) + GETJSAMPLE(*below_ptr) + + GETJSAMPLE(*inptr); + neighsum = lastcolsum + (colsum - membersum) + nextcolsum; + membersum = membersum * memberscale + neighsum * neighscale; + *outptr++ = (JSAMPLE) ((membersum + 32768) >> 16); + lastcolsum = colsum; colsum = nextcolsum; + } + + /* Special case for last column */ + membersum = GETJSAMPLE(*inptr); + neighsum = lastcolsum + (colsum - membersum) + colsum; + membersum = membersum * memberscale + neighsum * neighscale; + *outptr = (JSAMPLE) ((membersum + 32768) >> 16); + + } +} + +#endif /* INPUT_SMOOTHING_SUPPORTED */ + + +/* + * Module initialization routine for downsampling. + * Note that we must select a routine for each component. + */ + +GLOBAL(void) +jinit_downsampler (j_compress_ptr cinfo) +{ + my_downsample_ptr downsample; + int ci; + jpeg_component_info *compptr; + boolean smoothok = TRUE; + + downsample = (my_downsample_ptr) + (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE, + sizeof(my_downsampler)); + cinfo->downsample = (struct jpeg_downsampler *) downsample; + downsample->pub.start_pass = start_pass_downsample; + downsample->pub.downsample = sep_downsample; + downsample->pub.need_context_rows = FALSE; + + if (cinfo->CCIR601_sampling) + ERREXIT(cinfo, JERR_CCIR601_NOTIMPL); + + /* Verify we can handle the sampling factors, and set up method pointers */ + for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components; + ci++, compptr++) { + if (compptr->h_samp_factor == cinfo->max_h_samp_factor && + compptr->v_samp_factor == cinfo->max_v_samp_factor) { +#ifdef INPUT_SMOOTHING_SUPPORTED + if (cinfo->smoothing_factor) { + downsample->methods[ci] = fullsize_smooth_downsample; + downsample->pub.need_context_rows = TRUE; + } else +#endif + downsample->methods[ci] = fullsize_downsample; + } else if (compptr->h_samp_factor * 2 == cinfo->max_h_samp_factor && + compptr->v_samp_factor == cinfo->max_v_samp_factor) { + smoothok = FALSE; + if (jsimd_can_h2v1_downsample()) + downsample->methods[ci] = jsimd_h2v1_downsample; + else + downsample->methods[ci] = h2v1_downsample; + } else if (compptr->h_samp_factor * 2 == cinfo->max_h_samp_factor && + compptr->v_samp_factor * 2 == cinfo->max_v_samp_factor) { +#ifdef INPUT_SMOOTHING_SUPPORTED + if (cinfo->smoothing_factor) { +#if defined(__mips__) + if (jsimd_can_h2v2_smooth_downsample()) + downsample->methods[ci] = jsimd_h2v2_smooth_downsample; + else +#endif + downsample->methods[ci] = h2v2_smooth_downsample; + downsample->pub.need_context_rows = TRUE; + } else +#endif + { + if (jsimd_can_h2v2_downsample()) + downsample->methods[ci] = jsimd_h2v2_downsample; + else + downsample->methods[ci] = h2v2_downsample; + } + } else if ((cinfo->max_h_samp_factor % compptr->h_samp_factor) == 0 && + (cinfo->max_v_samp_factor % compptr->v_samp_factor) == 0) { + smoothok = FALSE; + downsample->methods[ci] = int_downsample; + } else + ERREXIT(cinfo, JERR_FRACT_SAMPLE_NOTIMPL); + } + +#ifdef INPUT_SMOOTHING_SUPPORTED + if (cinfo->smoothing_factor && !smoothok) + TRACEMS(cinfo, 0, JTRC_SMOOTH_NOTIMPL); +#endif +} diff --git a/media/libjpeg/jctrans.c b/media/libjpeg/jctrans.c new file mode 100644 index 000000000..6f16b052c --- /dev/null +++ b/media/libjpeg/jctrans.c @@ -0,0 +1,402 @@ +/* + * jctrans.c + * + * This file was part of the Independent JPEG Group's software: + * Copyright (C) 1995-1998, Thomas G. Lane. + * Modified 2000-2009 by Guido Vollbeding. + * It was modified by The libjpeg-turbo Project to include only code relevant + * to libjpeg-turbo. + * For conditions of distribution and use, see the accompanying README.ijg + * file. + * + * This file contains library routines for transcoding compression, + * that is, writing raw DCT coefficient arrays to an output JPEG file. + * The routines in jcapimin.c will also be needed by a transcoder. + */ + +#define JPEG_INTERNALS +#include "jinclude.h" +#include "jpeglib.h" + + +/* Forward declarations */ +LOCAL(void) transencode_master_selection + (j_compress_ptr cinfo, jvirt_barray_ptr *coef_arrays); +LOCAL(void) transencode_coef_controller + (j_compress_ptr cinfo, jvirt_barray_ptr *coef_arrays); + + +/* + * Compression initialization for writing raw-coefficient data. + * Before calling this, all parameters and a data destination must be set up. + * Call jpeg_finish_compress() to actually write the data. + * + * The number of passed virtual arrays must match cinfo->num_components. + * Note that the virtual arrays need not be filled or even realized at + * the time write_coefficients is called; indeed, if the virtual arrays + * were requested from this compression object's memory manager, they + * typically will be realized during this routine and filled afterwards. + */ + +GLOBAL(void) +jpeg_write_coefficients (j_compress_ptr cinfo, jvirt_barray_ptr *coef_arrays) +{ + if (cinfo->global_state != CSTATE_START) + ERREXIT1(cinfo, JERR_BAD_STATE, cinfo->global_state); + /* Mark all tables to be written */ + jpeg_suppress_tables(cinfo, FALSE); + /* (Re)initialize error mgr and destination modules */ + (*cinfo->err->reset_error_mgr) ((j_common_ptr) cinfo); + (*cinfo->dest->init_destination) (cinfo); + /* Perform master selection of active modules */ + transencode_master_selection(cinfo, coef_arrays); + /* Wait for jpeg_finish_compress() call */ + cinfo->next_scanline = 0; /* so jpeg_write_marker works */ + cinfo->global_state = CSTATE_WRCOEFS; +} + + +/* + * Initialize the compression object with default parameters, + * then copy from the source object all parameters needed for lossless + * transcoding. Parameters that can be varied without loss (such as + * scan script and Huffman optimization) are left in their default states. + */ + +GLOBAL(void) +jpeg_copy_critical_parameters (j_decompress_ptr srcinfo, + j_compress_ptr dstinfo) +{ + JQUANT_TBL **qtblptr; + jpeg_component_info *incomp, *outcomp; + JQUANT_TBL *c_quant, *slot_quant; + int tblno, ci, coefi; + + /* Safety check to ensure start_compress not called yet. */ + if (dstinfo->global_state != CSTATE_START) + ERREXIT1(dstinfo, JERR_BAD_STATE, dstinfo->global_state); + /* Copy fundamental image dimensions */ + dstinfo->image_width = srcinfo->image_width; + dstinfo->image_height = srcinfo->image_height; + dstinfo->input_components = srcinfo->num_components; + dstinfo->in_color_space = srcinfo->jpeg_color_space; +#if JPEG_LIB_VERSION >= 70 + dstinfo->jpeg_width = srcinfo->output_width; + dstinfo->jpeg_height = srcinfo->output_height; + dstinfo->min_DCT_h_scaled_size = srcinfo->min_DCT_h_scaled_size; + dstinfo->min_DCT_v_scaled_size = srcinfo->min_DCT_v_scaled_size; +#endif + /* Initialize all parameters to default values */ + jpeg_set_defaults(dstinfo); + /* jpeg_set_defaults may choose wrong colorspace, eg YCbCr if input is RGB. + * Fix it to get the right header markers for the image colorspace. + */ + jpeg_set_colorspace(dstinfo, srcinfo->jpeg_color_space); + dstinfo->data_precision = srcinfo->data_precision; + dstinfo->CCIR601_sampling = srcinfo->CCIR601_sampling; + /* Copy the source's quantization tables. */ + for (tblno = 0; tblno < NUM_QUANT_TBLS; tblno++) { + if (srcinfo->quant_tbl_ptrs[tblno] != NULL) { + qtblptr = & dstinfo->quant_tbl_ptrs[tblno]; + if (*qtblptr == NULL) + *qtblptr = jpeg_alloc_quant_table((j_common_ptr) dstinfo); + MEMCOPY((*qtblptr)->quantval, + srcinfo->quant_tbl_ptrs[tblno]->quantval, + sizeof((*qtblptr)->quantval)); + (*qtblptr)->sent_table = FALSE; + } + } + /* Copy the source's per-component info. + * Note we assume jpeg_set_defaults has allocated the dest comp_info array. + */ + dstinfo->num_components = srcinfo->num_components; + if (dstinfo->num_components < 1 || dstinfo->num_components > MAX_COMPONENTS) + ERREXIT2(dstinfo, JERR_COMPONENT_COUNT, dstinfo->num_components, + MAX_COMPONENTS); + for (ci = 0, incomp = srcinfo->comp_info, outcomp = dstinfo->comp_info; + ci < dstinfo->num_components; ci++, incomp++, outcomp++) { + outcomp->component_id = incomp->component_id; + outcomp->h_samp_factor = incomp->h_samp_factor; + outcomp->v_samp_factor = incomp->v_samp_factor; + outcomp->quant_tbl_no = incomp->quant_tbl_no; + /* Make sure saved quantization table for component matches the qtable + * slot. If not, the input file re-used this qtable slot. + * IJG encoder currently cannot duplicate this. + */ + tblno = outcomp->quant_tbl_no; + if (tblno < 0 || tblno >= NUM_QUANT_TBLS || + srcinfo->quant_tbl_ptrs[tblno] == NULL) + ERREXIT1(dstinfo, JERR_NO_QUANT_TABLE, tblno); + slot_quant = srcinfo->quant_tbl_ptrs[tblno]; + c_quant = incomp->quant_table; + if (c_quant != NULL) { + for (coefi = 0; coefi < DCTSIZE2; coefi++) { + if (c_quant->quantval[coefi] != slot_quant->quantval[coefi]) + ERREXIT1(dstinfo, JERR_MISMATCHED_QUANT_TABLE, tblno); + } + } + /* Note: we do not copy the source's Huffman table assignments; + * instead we rely on jpeg_set_colorspace to have made a suitable choice. + */ + } + /* Also copy JFIF version and resolution information, if available. + * Strictly speaking this isn't "critical" info, but it's nearly + * always appropriate to copy it if available. In particular, + * if the application chooses to copy JFIF 1.02 extension markers from + * the source file, we need to copy the version to make sure we don't + * emit a file that has 1.02 extensions but a claimed version of 1.01. + * We will *not*, however, copy version info from mislabeled "2.01" files. + */ + if (srcinfo->saw_JFIF_marker) { + if (srcinfo->JFIF_major_version == 1) { + dstinfo->JFIF_major_version = srcinfo->JFIF_major_version; + dstinfo->JFIF_minor_version = srcinfo->JFIF_minor_version; + } + dstinfo->density_unit = srcinfo->density_unit; + dstinfo->X_density = srcinfo->X_density; + dstinfo->Y_density = srcinfo->Y_density; + } +} + + +/* + * Master selection of compression modules for transcoding. + * This substitutes for jcinit.c's initialization of the full compressor. + */ + +LOCAL(void) +transencode_master_selection (j_compress_ptr cinfo, + jvirt_barray_ptr *coef_arrays) +{ + /* Although we don't actually use input_components for transcoding, + * jcmaster.c's initial_setup will complain if input_components is 0. + */ + cinfo->input_components = 1; + /* Initialize master control (includes parameter checking/processing) */ + jinit_c_master_control(cinfo, TRUE /* transcode only */); + + /* Entropy encoding: either Huffman or arithmetic coding. */ + if (cinfo->arith_code) { +#ifdef C_ARITH_CODING_SUPPORTED + jinit_arith_encoder(cinfo); +#else + ERREXIT(cinfo, JERR_ARITH_NOTIMPL); +#endif + } else { + if (cinfo->progressive_mode) { +#ifdef C_PROGRESSIVE_SUPPORTED + jinit_phuff_encoder(cinfo); +#else + ERREXIT(cinfo, JERR_NOT_COMPILED); +#endif + } else + jinit_huff_encoder(cinfo); + } + + /* We need a special coefficient buffer controller. */ + transencode_coef_controller(cinfo, coef_arrays); + + jinit_marker_writer(cinfo); + + /* We can now tell the memory manager to allocate virtual arrays. */ + (*cinfo->mem->realize_virt_arrays) ((j_common_ptr) cinfo); + + /* Write the datastream header (SOI, JFIF) immediately. + * Frame and scan headers are postponed till later. + * This lets application insert special markers after the SOI. + */ + (*cinfo->marker->write_file_header) (cinfo); +} + + +/* + * The rest of this file is a special implementation of the coefficient + * buffer controller. This is similar to jccoefct.c, but it handles only + * output from presupplied virtual arrays. Furthermore, we generate any + * dummy padding blocks on-the-fly rather than expecting them to be present + * in the arrays. + */ + +/* Private buffer controller object */ + +typedef struct { + struct jpeg_c_coef_controller pub; /* public fields */ + + JDIMENSION iMCU_row_num; /* iMCU row # within image */ + JDIMENSION mcu_ctr; /* counts MCUs processed in current row */ + int MCU_vert_offset; /* counts MCU rows within iMCU row */ + int MCU_rows_per_iMCU_row; /* number of such rows needed */ + + /* Virtual block array for each component. */ + jvirt_barray_ptr *whole_image; + + /* Workspace for constructing dummy blocks at right/bottom edges. */ + JBLOCKROW dummy_buffer[C_MAX_BLOCKS_IN_MCU]; +} my_coef_controller; + +typedef my_coef_controller *my_coef_ptr; + + +LOCAL(void) +start_iMCU_row (j_compress_ptr cinfo) +/* Reset within-iMCU-row counters for a new row */ +{ + my_coef_ptr coef = (my_coef_ptr) cinfo->coef; + + /* In an interleaved scan, an MCU row is the same as an iMCU row. + * In a noninterleaved scan, an iMCU row has v_samp_factor MCU rows. + * But at the bottom of the image, process only what's left. + */ + if (cinfo->comps_in_scan > 1) { + coef->MCU_rows_per_iMCU_row = 1; + } else { + if (coef->iMCU_row_num < (cinfo->total_iMCU_rows-1)) + coef->MCU_rows_per_iMCU_row = cinfo->cur_comp_info[0]->v_samp_factor; + else + coef->MCU_rows_per_iMCU_row = cinfo->cur_comp_info[0]->last_row_height; + } + + coef->mcu_ctr = 0; + coef->MCU_vert_offset = 0; +} + + +/* + * Initialize for a processing pass. + */ + +METHODDEF(void) +start_pass_coef (j_compress_ptr cinfo, J_BUF_MODE pass_mode) +{ + my_coef_ptr coef = (my_coef_ptr) cinfo->coef; + + if (pass_mode != JBUF_CRANK_DEST) + ERREXIT(cinfo, JERR_BAD_BUFFER_MODE); + + coef->iMCU_row_num = 0; + start_iMCU_row(cinfo); +} + + +/* + * Process some data. + * We process the equivalent of one fully interleaved MCU row ("iMCU" row) + * per call, ie, v_samp_factor block rows for each component in the scan. + * The data is obtained from the virtual arrays and fed to the entropy coder. + * Returns TRUE if the iMCU row is completed, FALSE if suspended. + * + * NB: input_buf is ignored; it is likely to be a NULL pointer. + */ + +METHODDEF(boolean) +compress_output (j_compress_ptr cinfo, JSAMPIMAGE input_buf) +{ + my_coef_ptr coef = (my_coef_ptr) cinfo->coef; + JDIMENSION MCU_col_num; /* index of current MCU within row */ + JDIMENSION last_MCU_col = cinfo->MCUs_per_row - 1; + JDIMENSION last_iMCU_row = cinfo->total_iMCU_rows - 1; + int blkn, ci, xindex, yindex, yoffset, blockcnt; + JDIMENSION start_col; + JBLOCKARRAY buffer[MAX_COMPS_IN_SCAN]; + JBLOCKROW MCU_buffer[C_MAX_BLOCKS_IN_MCU]; + JBLOCKROW buffer_ptr; + jpeg_component_info *compptr; + + /* Align the virtual buffers for the components used in this scan. */ + for (ci = 0; ci < cinfo->comps_in_scan; ci++) { + compptr = cinfo->cur_comp_info[ci]; + buffer[ci] = (*cinfo->mem->access_virt_barray) + ((j_common_ptr) cinfo, coef->whole_image[compptr->component_index], + coef->iMCU_row_num * compptr->v_samp_factor, + (JDIMENSION) compptr->v_samp_factor, FALSE); + } + + /* Loop to process one whole iMCU row */ + for (yoffset = coef->MCU_vert_offset; yoffset < coef->MCU_rows_per_iMCU_row; + yoffset++) { + for (MCU_col_num = coef->mcu_ctr; MCU_col_num < cinfo->MCUs_per_row; + MCU_col_num++) { + /* Construct list of pointers to DCT blocks belonging to this MCU */ + blkn = 0; /* index of current DCT block within MCU */ + for (ci = 0; ci < cinfo->comps_in_scan; ci++) { + compptr = cinfo->cur_comp_info[ci]; + start_col = MCU_col_num * compptr->MCU_width; + blockcnt = (MCU_col_num < last_MCU_col) ? compptr->MCU_width + : compptr->last_col_width; + for (yindex = 0; yindex < compptr->MCU_height; yindex++) { + if (coef->iMCU_row_num < last_iMCU_row || + yindex+yoffset < compptr->last_row_height) { + /* Fill in pointers to real blocks in this row */ + buffer_ptr = buffer[ci][yindex+yoffset] + start_col; + for (xindex = 0; xindex < blockcnt; xindex++) + MCU_buffer[blkn++] = buffer_ptr++; + } else { + /* At bottom of image, need a whole row of dummy blocks */ + xindex = 0; + } + /* Fill in any dummy blocks needed in this row. + * Dummy blocks are filled in the same way as in jccoefct.c: + * all zeroes in the AC entries, DC entries equal to previous + * block's DC value. The init routine has already zeroed the + * AC entries, so we need only set the DC entries correctly. + */ + for (; xindex < compptr->MCU_width; xindex++) { + MCU_buffer[blkn] = coef->dummy_buffer[blkn]; + MCU_buffer[blkn][0][0] = MCU_buffer[blkn-1][0][0]; + blkn++; + } + } + } + /* Try to write the MCU. */ + if (! (*cinfo->entropy->encode_mcu) (cinfo, MCU_buffer)) { + /* Suspension forced; update state counters and exit */ + coef->MCU_vert_offset = yoffset; + coef->mcu_ctr = MCU_col_num; + return FALSE; + } + } + /* Completed an MCU row, but perhaps not an iMCU row */ + coef->mcu_ctr = 0; + } + /* Completed the iMCU row, advance counters for next one */ + coef->iMCU_row_num++; + start_iMCU_row(cinfo); + return TRUE; +} + + +/* + * Initialize coefficient buffer controller. + * + * Each passed coefficient array must be the right size for that + * coefficient: width_in_blocks wide and height_in_blocks high, + * with unitheight at least v_samp_factor. + */ + +LOCAL(void) +transencode_coef_controller (j_compress_ptr cinfo, + jvirt_barray_ptr *coef_arrays) +{ + my_coef_ptr coef; + JBLOCKROW buffer; + int i; + + coef = (my_coef_ptr) + (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE, + sizeof(my_coef_controller)); + cinfo->coef = (struct jpeg_c_coef_controller *) coef; + coef->pub.start_pass = start_pass_coef; + coef->pub.compress_data = compress_output; + + /* Save pointer to virtual arrays */ + coef->whole_image = coef_arrays; + + /* Allocate and pre-zero space for dummy DCT blocks. */ + buffer = (JBLOCKROW) + (*cinfo->mem->alloc_large) ((j_common_ptr) cinfo, JPOOL_IMAGE, + C_MAX_BLOCKS_IN_MCU * sizeof(JBLOCK)); + jzero_far((void *) buffer, C_MAX_BLOCKS_IN_MCU * sizeof(JBLOCK)); + for (i = 0; i < C_MAX_BLOCKS_IN_MCU; i++) { + coef->dummy_buffer[i] = buffer + i; + } +} diff --git a/media/libjpeg/jdapimin.c b/media/libjpeg/jdapimin.c new file mode 100644 index 000000000..f80a14667 --- /dev/null +++ b/media/libjpeg/jdapimin.c @@ -0,0 +1,407 @@ +/* + * jdapimin.c + * + * This file was part of the Independent JPEG Group's software: + * Copyright (C) 1994-1998, Thomas G. Lane. + * libjpeg-turbo Modifications: + * Copyright (C) 2016, D. R. Commander. + * For conditions of distribution and use, see the accompanying README.ijg + * file. + * + * This file contains application interface code for the decompression half + * of the JPEG library. These are the "minimum" API routines that may be + * needed in either the normal full-decompression case or the + * transcoding-only case. + * + * Most of the routines intended to be called directly by an application + * are in this file or in jdapistd.c. But also see jcomapi.c for routines + * shared by compression and decompression, and jdtrans.c for the transcoding + * case. + */ + +#define JPEG_INTERNALS +#include "jinclude.h" +#include "jpeglib.h" +#include "jdmaster.h" + + +/* + * Initialization of a JPEG decompression object. + * The error manager must already be set up (in case memory manager fails). + */ + +GLOBAL(void) +jpeg_CreateDecompress (j_decompress_ptr cinfo, int version, size_t structsize) +{ + int i; + + /* Guard against version mismatches between library and caller. */ + cinfo->mem = NULL; /* so jpeg_destroy knows mem mgr not called */ + if (version != JPEG_LIB_VERSION) + ERREXIT2(cinfo, JERR_BAD_LIB_VERSION, JPEG_LIB_VERSION, version); + if (structsize != sizeof(struct jpeg_decompress_struct)) + ERREXIT2(cinfo, JERR_BAD_STRUCT_SIZE, + (int) sizeof(struct jpeg_decompress_struct), (int) structsize); + + /* For debugging purposes, we zero the whole master structure. + * But the application has already set the err pointer, and may have set + * client_data, so we have to save and restore those fields. + * Note: if application hasn't set client_data, tools like Purify may + * complain here. + */ + { + struct jpeg_error_mgr * err = cinfo->err; + void * client_data = cinfo->client_data; /* ignore Purify complaint here */ + MEMZERO(cinfo, sizeof(struct jpeg_decompress_struct)); + cinfo->err = err; + cinfo->client_data = client_data; + } + cinfo->is_decompressor = TRUE; + + /* Initialize a memory manager instance for this object */ + jinit_memory_mgr((j_common_ptr) cinfo); + + /* Zero out pointers to permanent structures. */ + cinfo->progress = NULL; + cinfo->src = NULL; + + for (i = 0; i < NUM_QUANT_TBLS; i++) + cinfo->quant_tbl_ptrs[i] = NULL; + + for (i = 0; i < NUM_HUFF_TBLS; i++) { + cinfo->dc_huff_tbl_ptrs[i] = NULL; + cinfo->ac_huff_tbl_ptrs[i] = NULL; + } + + /* Initialize marker processor so application can override methods + * for COM, APPn markers before calling jpeg_read_header. + */ + cinfo->marker_list = NULL; + jinit_marker_reader(cinfo); + + /* And initialize the overall input controller. */ + jinit_input_controller(cinfo); + + /* OK, I'm ready */ + cinfo->global_state = DSTATE_START; + + /* The master struct is used to store extension parameters, so we allocate it + * here. + */ + cinfo->master = (struct jpeg_decomp_master *) + (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_PERMANENT, + sizeof(my_decomp_master)); + MEMZERO(cinfo->master, sizeof(my_decomp_master)); +} + + +/* + * Destruction of a JPEG decompression object + */ + +GLOBAL(void) +jpeg_destroy_decompress (j_decompress_ptr cinfo) +{ + jpeg_destroy((j_common_ptr) cinfo); /* use common routine */ +} + + +/* + * Abort processing of a JPEG decompression operation, + * but don't destroy the object itself. + */ + +GLOBAL(void) +jpeg_abort_decompress (j_decompress_ptr cinfo) +{ + jpeg_abort((j_common_ptr) cinfo); /* use common routine */ +} + + +/* + * Set default decompression parameters. + */ + +LOCAL(void) +default_decompress_parms (j_decompress_ptr cinfo) +{ + /* Guess the input colorspace, and set output colorspace accordingly. */ + /* (Wish JPEG committee had provided a real way to specify this...) */ + /* Note application may override our guesses. */ + switch (cinfo->num_components) { + case 1: + cinfo->jpeg_color_space = JCS_GRAYSCALE; + cinfo->out_color_space = JCS_GRAYSCALE; + break; + + case 3: + if (cinfo->saw_JFIF_marker) { + cinfo->jpeg_color_space = JCS_YCbCr; /* JFIF implies YCbCr */ + } else if (cinfo->saw_Adobe_marker) { + switch (cinfo->Adobe_transform) { + case 0: + cinfo->jpeg_color_space = JCS_RGB; + break; + case 1: + cinfo->jpeg_color_space = JCS_YCbCr; + break; + default: + WARNMS1(cinfo, JWRN_ADOBE_XFORM, cinfo->Adobe_transform); + cinfo->jpeg_color_space = JCS_YCbCr; /* assume it's YCbCr */ + break; + } + } else { + /* Saw no special markers, try to guess from the component IDs */ + int cid0 = cinfo->comp_info[0].component_id; + int cid1 = cinfo->comp_info[1].component_id; + int cid2 = cinfo->comp_info[2].component_id; + + if (cid0 == 1 && cid1 == 2 && cid2 == 3) + cinfo->jpeg_color_space = JCS_YCbCr; /* assume JFIF w/out marker */ + else if (cid0 == 82 && cid1 == 71 && cid2 == 66) + cinfo->jpeg_color_space = JCS_RGB; /* ASCII 'R', 'G', 'B' */ + else { + TRACEMS3(cinfo, 1, JTRC_UNKNOWN_IDS, cid0, cid1, cid2); + cinfo->jpeg_color_space = JCS_YCbCr; /* assume it's YCbCr */ + } + } + /* Always guess RGB is proper output colorspace. */ + cinfo->out_color_space = JCS_RGB; + break; + + case 4: + if (cinfo->saw_Adobe_marker) { + switch (cinfo->Adobe_transform) { + case 0: + cinfo->jpeg_color_space = JCS_CMYK; + break; + case 2: + cinfo->jpeg_color_space = JCS_YCCK; + break; + default: + WARNMS1(cinfo, JWRN_ADOBE_XFORM, cinfo->Adobe_transform); + cinfo->jpeg_color_space = JCS_YCCK; /* assume it's YCCK */ + break; + } + } else { + /* No special markers, assume straight CMYK. */ + cinfo->jpeg_color_space = JCS_CMYK; + } + cinfo->out_color_space = JCS_CMYK; + break; + + default: + cinfo->jpeg_color_space = JCS_UNKNOWN; + cinfo->out_color_space = JCS_UNKNOWN; + break; + } + + /* Set defaults for other decompression parameters. */ + cinfo->scale_num = 1; /* 1:1 scaling */ + cinfo->scale_denom = 1; + cinfo->output_gamma = 1.0; + cinfo->buffered_image = FALSE; + cinfo->raw_data_out = FALSE; + cinfo->dct_method = JDCT_DEFAULT; + cinfo->do_fancy_upsampling = TRUE; + cinfo->do_block_smoothing = TRUE; + cinfo->quantize_colors = FALSE; + /* We set these in case application only sets quantize_colors. */ + cinfo->dither_mode = JDITHER_FS; +#ifdef QUANT_2PASS_SUPPORTED + cinfo->two_pass_quantize = TRUE; +#else + cinfo->two_pass_quantize = FALSE; +#endif + cinfo->desired_number_of_colors = 256; + cinfo->colormap = NULL; + /* Initialize for no mode change in buffered-image mode. */ + cinfo->enable_1pass_quant = FALSE; + cinfo->enable_external_quant = FALSE; + cinfo->enable_2pass_quant = FALSE; +} + + +/* + * Decompression startup: read start of JPEG datastream to see what's there. + * Need only initialize JPEG object and supply a data source before calling. + * + * This routine will read as far as the first SOS marker (ie, actual start of + * compressed data), and will save all tables and parameters in the JPEG + * object. It will also initialize the decompression parameters to default + * values, and finally return JPEG_HEADER_OK. On return, the application may + * adjust the decompression parameters and then call jpeg_start_decompress. + * (Or, if the application only wanted to determine the image parameters, + * the data need not be decompressed. In that case, call jpeg_abort or + * jpeg_destroy to release any temporary space.) + * If an abbreviated (tables only) datastream is presented, the routine will + * return JPEG_HEADER_TABLES_ONLY upon reaching EOI. The application may then + * re-use the JPEG object to read the abbreviated image datastream(s). + * It is unnecessary (but OK) to call jpeg_abort in this case. + * The JPEG_SUSPENDED return code only occurs if the data source module + * requests suspension of the decompressor. In this case the application + * should load more source data and then re-call jpeg_read_header to resume + * processing. + * If a non-suspending data source is used and require_image is TRUE, then the + * return code need not be inspected since only JPEG_HEADER_OK is possible. + * + * This routine is now just a front end to jpeg_consume_input, with some + * extra error checking. + */ + +GLOBAL(int) +jpeg_read_header (j_decompress_ptr cinfo, boolean require_image) +{ + int retcode; + + if (cinfo->global_state != DSTATE_START && + cinfo->global_state != DSTATE_INHEADER) + ERREXIT1(cinfo, JERR_BAD_STATE, cinfo->global_state); + + retcode = jpeg_consume_input(cinfo); + + switch (retcode) { + case JPEG_REACHED_SOS: + retcode = JPEG_HEADER_OK; + break; + case JPEG_REACHED_EOI: + if (require_image) /* Complain if application wanted an image */ + ERREXIT(cinfo, JERR_NO_IMAGE); + /* Reset to start state; it would be safer to require the application to + * call jpeg_abort, but we can't change it now for compatibility reasons. + * A side effect is to free any temporary memory (there shouldn't be any). + */ + jpeg_abort((j_common_ptr) cinfo); /* sets state = DSTATE_START */ + retcode = JPEG_HEADER_TABLES_ONLY; + break; + case JPEG_SUSPENDED: + /* no work */ + break; + } + + return retcode; +} + + +/* + * Consume data in advance of what the decompressor requires. + * This can be called at any time once the decompressor object has + * been created and a data source has been set up. + * + * This routine is essentially a state machine that handles a couple + * of critical state-transition actions, namely initial setup and + * transition from header scanning to ready-for-start_decompress. + * All the actual input is done via the input controller's consume_input + * method. + */ + +GLOBAL(int) +jpeg_consume_input (j_decompress_ptr cinfo) +{ + int retcode = JPEG_SUSPENDED; + + /* NB: every possible DSTATE value should be listed in this switch */ + switch (cinfo->global_state) { + case DSTATE_START: + /* Start-of-datastream actions: reset appropriate modules */ + (*cinfo->inputctl->reset_input_controller) (cinfo); + /* Initialize application's data source module */ + (*cinfo->src->init_source) (cinfo); + cinfo->global_state = DSTATE_INHEADER; + /*FALLTHROUGH*/ + case DSTATE_INHEADER: + retcode = (*cinfo->inputctl->consume_input) (cinfo); + if (retcode == JPEG_REACHED_SOS) { /* Found SOS, prepare to decompress */ + /* Set up default parameters based on header data */ + default_decompress_parms(cinfo); + /* Set global state: ready for start_decompress */ + cinfo->global_state = DSTATE_READY; + } + break; + case DSTATE_READY: + /* Can't advance past first SOS until start_decompress is called */ + retcode = JPEG_REACHED_SOS; + break; + case DSTATE_PRELOAD: + case DSTATE_PRESCAN: + case DSTATE_SCANNING: + case DSTATE_RAW_OK: + case DSTATE_BUFIMAGE: + case DSTATE_BUFPOST: + case DSTATE_STOPPING: + retcode = (*cinfo->inputctl->consume_input) (cinfo); + break; + default: + ERREXIT1(cinfo, JERR_BAD_STATE, cinfo->global_state); + } + return retcode; +} + + +/* + * Have we finished reading the input file? + */ + +GLOBAL(boolean) +jpeg_input_complete (j_decompress_ptr cinfo) +{ + /* Check for valid jpeg object */ + if (cinfo->global_state < DSTATE_START || + cinfo->global_state > DSTATE_STOPPING) + ERREXIT1(cinfo, JERR_BAD_STATE, cinfo->global_state); + return cinfo->inputctl->eoi_reached; +} + + +/* + * Is there more than one scan? + */ + +GLOBAL(boolean) +jpeg_has_multiple_scans (j_decompress_ptr cinfo) +{ + /* Only valid after jpeg_read_header completes */ + if (cinfo->global_state < DSTATE_READY || + cinfo->global_state > DSTATE_STOPPING) + ERREXIT1(cinfo, JERR_BAD_STATE, cinfo->global_state); + return cinfo->inputctl->has_multiple_scans; +} + + +/* + * Finish JPEG decompression. + * + * This will normally just verify the file trailer and release temp storage. + * + * Returns FALSE if suspended. The return value need be inspected only if + * a suspending data source is used. + */ + +GLOBAL(boolean) +jpeg_finish_decompress (j_decompress_ptr cinfo) +{ + if ((cinfo->global_state == DSTATE_SCANNING || + cinfo->global_state == DSTATE_RAW_OK) && ! cinfo->buffered_image) { + /* Terminate final pass of non-buffered mode */ + if (cinfo->output_scanline < cinfo->output_height) + ERREXIT(cinfo, JERR_TOO_LITTLE_DATA); + (*cinfo->master->finish_output_pass) (cinfo); + cinfo->global_state = DSTATE_STOPPING; + } else if (cinfo->global_state == DSTATE_BUFIMAGE) { + /* Finishing after a buffered-image operation */ + cinfo->global_state = DSTATE_STOPPING; + } else if (cinfo->global_state != DSTATE_STOPPING) { + /* STOPPING = repeat call after a suspension, anything else is error */ + ERREXIT1(cinfo, JERR_BAD_STATE, cinfo->global_state); + } + /* Read until EOI */ + while (! cinfo->inputctl->eoi_reached) { + if ((*cinfo->inputctl->consume_input) (cinfo) == JPEG_SUSPENDED) + return FALSE; /* Suspend, come back later */ + } + /* Do final cleanup */ + (*cinfo->src->term_source) (cinfo); + /* We can use jpeg_abort to release memory and reset global_state */ + jpeg_abort((j_common_ptr) cinfo); + return TRUE; +} diff --git a/media/libjpeg/jdapistd.c b/media/libjpeg/jdapistd.c new file mode 100644 index 000000000..37afc8448 --- /dev/null +++ b/media/libjpeg/jdapistd.c @@ -0,0 +1,614 @@ +/* + * jdapistd.c + * + * This file was part of the Independent JPEG Group's software: + * Copyright (C) 1994-1996, Thomas G. Lane. + * libjpeg-turbo Modifications: + * Copyright (C) 2010, 2015-2016, D. R. Commander. + * Copyright (C) 2015, Google, Inc. + * For conditions of distribution and use, see the accompanying README.ijg + * file. + * + * This file contains application interface code for the decompression half + * of the JPEG library. These are the "standard" API routines that are + * used in the normal full-decompression case. They are not used by a + * transcoding-only application. Note that if an application links in + * jpeg_start_decompress, it will end up linking in the entire decompressor. + * We thus must separate this file from jdapimin.c to avoid linking the + * whole decompression library into a transcoder. + */ + +#include "jinclude.h" +#include "jdmainct.h" +#include "jdcoefct.h" +#include "jdsample.h" +#include "jmemsys.h" + +/* Forward declarations */ +LOCAL(boolean) output_pass_setup (j_decompress_ptr cinfo); + + +/* + * Decompression initialization. + * jpeg_read_header must be completed before calling this. + * + * If a multipass operating mode was selected, this will do all but the + * last pass, and thus may take a great deal of time. + * + * Returns FALSE if suspended. The return value need be inspected only if + * a suspending data source is used. + */ + +GLOBAL(boolean) +jpeg_start_decompress (j_decompress_ptr cinfo) +{ + if (cinfo->global_state == DSTATE_READY) { + /* First call: initialize master control, select active modules */ + jinit_master_decompress(cinfo); + if (cinfo->buffered_image) { + /* No more work here; expecting jpeg_start_output next */ + cinfo->global_state = DSTATE_BUFIMAGE; + return TRUE; + } + cinfo->global_state = DSTATE_PRELOAD; + } + if (cinfo->global_state == DSTATE_PRELOAD) { + /* If file has multiple scans, absorb them all into the coef buffer */ + if (cinfo->inputctl->has_multiple_scans) { +#ifdef D_MULTISCAN_FILES_SUPPORTED + for (;;) { + int retcode; + /* Call progress monitor hook if present */ + if (cinfo->progress != NULL) + (*cinfo->progress->progress_monitor) ((j_common_ptr) cinfo); + /* Absorb some more input */ + retcode = (*cinfo->inputctl->consume_input) (cinfo); + if (retcode == JPEG_SUSPENDED) + return FALSE; + if (retcode == JPEG_REACHED_EOI) + break; + /* Advance progress counter if appropriate */ + if (cinfo->progress != NULL && + (retcode == JPEG_ROW_COMPLETED || retcode == JPEG_REACHED_SOS)) { + if (++cinfo->progress->pass_counter >= cinfo->progress->pass_limit) { + /* jdmaster underestimated number of scans; ratchet up one scan */ + cinfo->progress->pass_limit += (long) cinfo->total_iMCU_rows; + } + } + } +#else + ERREXIT(cinfo, JERR_NOT_COMPILED); +#endif /* D_MULTISCAN_FILES_SUPPORTED */ + } + cinfo->output_scan_number = cinfo->input_scan_number; + } else if (cinfo->global_state != DSTATE_PRESCAN) + ERREXIT1(cinfo, JERR_BAD_STATE, cinfo->global_state); + /* Perform any dummy output passes, and set up for the final pass */ + return output_pass_setup(cinfo); +} + + +/* + * Set up for an output pass, and perform any dummy pass(es) needed. + * Common subroutine for jpeg_start_decompress and jpeg_start_output. + * Entry: global_state = DSTATE_PRESCAN only if previously suspended. + * Exit: If done, returns TRUE and sets global_state for proper output mode. + * If suspended, returns FALSE and sets global_state = DSTATE_PRESCAN. + */ + +LOCAL(boolean) +output_pass_setup (j_decompress_ptr cinfo) +{ + if (cinfo->global_state != DSTATE_PRESCAN) { + /* First call: do pass setup */ + (*cinfo->master->prepare_for_output_pass) (cinfo); + cinfo->output_scanline = 0; + cinfo->global_state = DSTATE_PRESCAN; + } + /* Loop over any required dummy passes */ + while (cinfo->master->is_dummy_pass) { +#ifdef QUANT_2PASS_SUPPORTED + /* Crank through the dummy pass */ + while (cinfo->output_scanline < cinfo->output_height) { + JDIMENSION last_scanline; + /* Call progress monitor hook if present */ + if (cinfo->progress != NULL) { + cinfo->progress->pass_counter = (long) cinfo->output_scanline; + cinfo->progress->pass_limit = (long) cinfo->output_height; + (*cinfo->progress->progress_monitor) ((j_common_ptr) cinfo); + } + /* Process some data */ + last_scanline = cinfo->output_scanline; + (*cinfo->main->process_data) (cinfo, (JSAMPARRAY) NULL, + &cinfo->output_scanline, (JDIMENSION) 0); + if (cinfo->output_scanline == last_scanline) + return FALSE; /* No progress made, must suspend */ + } + /* Finish up dummy pass, and set up for another one */ + (*cinfo->master->finish_output_pass) (cinfo); + (*cinfo->master->prepare_for_output_pass) (cinfo); + cinfo->output_scanline = 0; +#else + ERREXIT(cinfo, JERR_NOT_COMPILED); +#endif /* QUANT_2PASS_SUPPORTED */ + } + /* Ready for application to drive output pass through + * jpeg_read_scanlines or jpeg_read_raw_data. + */ + cinfo->global_state = cinfo->raw_data_out ? DSTATE_RAW_OK : DSTATE_SCANNING; + return TRUE; +} + + +/* + * Enable partial scanline decompression + * + * Must be called after jpeg_start_decompress() and before any calls to + * jpeg_read_scanlines() or jpeg_skip_scanlines(). + * + * Refer to libjpeg.txt for more information. + */ + +GLOBAL(void) +jpeg_crop_scanline (j_decompress_ptr cinfo, JDIMENSION *xoffset, + JDIMENSION *width) +{ + int ci, align, orig_downsampled_width; + JDIMENSION input_xoffset; + boolean reinit_upsampler = FALSE; + jpeg_component_info *compptr; + + if (cinfo->global_state != DSTATE_SCANNING || cinfo->output_scanline != 0) + ERREXIT1(cinfo, JERR_BAD_STATE, cinfo->global_state); + + if (!xoffset || !width) + ERREXIT(cinfo, JERR_BAD_CROP_SPEC); + + /* xoffset and width must fall within the output image dimensions. */ + if (*width == 0 || *xoffset + *width > cinfo->output_width) + ERREXIT(cinfo, JERR_WIDTH_OVERFLOW); + + /* No need to do anything if the caller wants the entire width. */ + if (*width == cinfo->output_width) + return; + + /* Ensuring the proper alignment of xoffset is tricky. At minimum, it + * must align with an MCU boundary, because: + * + * (1) The IDCT is performed in blocks, and it is not feasible to modify + * the algorithm so that it can transform partial blocks. + * (2) Because of the SIMD extensions, any input buffer passed to the + * upsampling and color conversion routines must be aligned to the + * SIMD word size (for instance, 128-bit in the case of SSE2.) The + * easiest way to accomplish this without copying data is to ensure + * that upsampling and color conversion begin at the start of the + * first MCU column that will be inverse transformed. + * + * In practice, we actually impose a stricter alignment requirement. We + * require that xoffset be a multiple of the maximum MCU column width of all + * of the components (the "iMCU column width.") This is to simplify the + * single-pass decompression case, allowing us to use the same MCU column + * width for all of the components. + */ + align = cinfo->_min_DCT_scaled_size * cinfo->max_h_samp_factor; + + /* Adjust xoffset to the nearest iMCU boundary <= the requested value */ + input_xoffset = *xoffset; + *xoffset = (input_xoffset / align) * align; + + /* Adjust the width so that the right edge of the output image is as + * requested (only the left edge is altered.) It is important that calling + * programs check this value after this function returns, so that they can + * allocate an output buffer with the appropriate size. + */ + *width = *width + input_xoffset - *xoffset; + cinfo->output_width = *width; + + /* Set the first and last iMCU columns that we must decompress. These values + * will be used in single-scan decompressions. + */ + cinfo->master->first_iMCU_col = + (JDIMENSION) (long) (*xoffset) / (long) align; + cinfo->master->last_iMCU_col = + (JDIMENSION) jdiv_round_up((long) (*xoffset + cinfo->output_width), + (long) align) - 1; + + for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components; + ci++, compptr++) { + /* Set downsampled_width to the new output width. */ + orig_downsampled_width = compptr->downsampled_width; + compptr->downsampled_width = + (JDIMENSION) jdiv_round_up((long) (cinfo->output_width * + compptr->h_samp_factor), + (long) cinfo->max_h_samp_factor); + if (compptr->downsampled_width < 2 && orig_downsampled_width >= 2) + reinit_upsampler = TRUE; + + /* Set the first and last iMCU columns that we must decompress. These + * values will be used in multi-scan decompressions. + */ + cinfo->master->first_MCU_col[ci] = + (JDIMENSION) (long) (*xoffset * compptr->h_samp_factor) / + (long) align; + cinfo->master->last_MCU_col[ci] = + (JDIMENSION) jdiv_round_up((long) ((*xoffset + cinfo->output_width) * + compptr->h_samp_factor), + (long) align) - 1; + } + + if (reinit_upsampler) { + cinfo->master->jinit_upsampler_no_alloc = TRUE; + jinit_upsampler(cinfo); + cinfo->master->jinit_upsampler_no_alloc = FALSE; + } +} + + +/* + * Read some scanlines of data from the JPEG decompressor. + * + * The return value will be the number of lines actually read. + * This may be less than the number requested in several cases, + * including bottom of image, data source suspension, and operating + * modes that emit multiple scanlines at a time. + * + * Note: we warn about excess calls to jpeg_read_scanlines() since + * this likely signals an application programmer error. However, + * an oversize buffer (max_lines > scanlines remaining) is not an error. + */ + +GLOBAL(JDIMENSION) +jpeg_read_scanlines (j_decompress_ptr cinfo, JSAMPARRAY scanlines, + JDIMENSION max_lines) +{ + JDIMENSION row_ctr; + + if (cinfo->global_state != DSTATE_SCANNING) + ERREXIT1(cinfo, JERR_BAD_STATE, cinfo->global_state); + if (cinfo->output_scanline >= cinfo->output_height) { + WARNMS(cinfo, JWRN_TOO_MUCH_DATA); + return 0; + } + + /* Call progress monitor hook if present */ + if (cinfo->progress != NULL) { + cinfo->progress->pass_counter = (long) cinfo->output_scanline; + cinfo->progress->pass_limit = (long) cinfo->output_height; + (*cinfo->progress->progress_monitor) ((j_common_ptr) cinfo); + } + + /* Process some data */ + row_ctr = 0; + (*cinfo->main->process_data) (cinfo, scanlines, &row_ctr, max_lines); + cinfo->output_scanline += row_ctr; + return row_ctr; +} + + +/* Dummy color convert function used by jpeg_skip_scanlines() */ +LOCAL(void) +noop_convert (j_decompress_ptr cinfo, JSAMPIMAGE input_buf, + JDIMENSION input_row, JSAMPARRAY output_buf, int num_rows) +{ +} + + +/* + * In some cases, it is best to call jpeg_read_scanlines() and discard the + * output, rather than skipping the scanlines, because this allows us to + * maintain the internal state of the context-based upsampler. In these cases, + * we set up and tear down a dummy color converter in order to avoid valgrind + * errors and to achieve the best possible performance. + */ + +LOCAL(void) +read_and_discard_scanlines (j_decompress_ptr cinfo, JDIMENSION num_lines) +{ + JDIMENSION n; + void (*color_convert) (j_decompress_ptr cinfo, JSAMPIMAGE input_buf, + JDIMENSION input_row, JSAMPARRAY output_buf, + int num_rows); + + color_convert = cinfo->cconvert->color_convert; + cinfo->cconvert->color_convert = noop_convert; + + for (n = 0; n < num_lines; n++) + jpeg_read_scanlines(cinfo, NULL, 1); + + cinfo->cconvert->color_convert = color_convert; +} + + +/* + * Called by jpeg_skip_scanlines(). This partially skips a decompress block by + * incrementing the rowgroup counter. + */ + +LOCAL(void) +increment_simple_rowgroup_ctr (j_decompress_ptr cinfo, JDIMENSION rows) +{ + JDIMENSION rows_left; + my_main_ptr main_ptr = (my_main_ptr) cinfo->main; + + /* Increment the counter to the next row group after the skipped rows. */ + main_ptr->rowgroup_ctr += rows / cinfo->max_v_samp_factor; + + /* Partially skipping a row group would involve modifying the internal state + * of the upsampler, so read the remaining rows into a dummy buffer instead. + */ + rows_left = rows % cinfo->max_v_samp_factor; + cinfo->output_scanline += rows - rows_left; + + read_and_discard_scanlines(cinfo, rows_left); +} + +/* + * Skips some scanlines of data from the JPEG decompressor. + * + * The return value will be the number of lines actually skipped. If skipping + * num_lines would move beyond the end of the image, then the actual number of + * lines remaining in the image is returned. Otherwise, the return value will + * be equal to num_lines. + * + * Refer to libjpeg.txt for more information. + */ + +GLOBAL(JDIMENSION) +jpeg_skip_scanlines (j_decompress_ptr cinfo, JDIMENSION num_lines) +{ + my_main_ptr main_ptr = (my_main_ptr) cinfo->main; + my_coef_ptr coef = (my_coef_ptr) cinfo->coef; + my_upsample_ptr upsample = (my_upsample_ptr) cinfo->upsample; + JDIMENSION i, x; + int y; + JDIMENSION lines_per_iMCU_row, lines_left_in_iMCU_row, lines_after_iMCU_row; + JDIMENSION lines_to_skip, lines_to_read; + + if (cinfo->global_state != DSTATE_SCANNING) + ERREXIT1(cinfo, JERR_BAD_STATE, cinfo->global_state); + + /* Do not skip past the bottom of the image. */ + if (cinfo->output_scanline + num_lines >= cinfo->output_height) { + cinfo->output_scanline = cinfo->output_height; + return cinfo->output_height - cinfo->output_scanline; + } + + if (num_lines == 0) + return 0; + + lines_per_iMCU_row = cinfo->_min_DCT_scaled_size * cinfo->max_v_samp_factor; + lines_left_in_iMCU_row = + (lines_per_iMCU_row - (cinfo->output_scanline % lines_per_iMCU_row)) % + lines_per_iMCU_row; + lines_after_iMCU_row = num_lines - lines_left_in_iMCU_row; + + /* Skip the lines remaining in the current iMCU row. When upsampling + * requires context rows, we need the previous and next rows in order to read + * the current row. This adds some complexity. + */ + if (cinfo->upsample->need_context_rows) { + /* If the skipped lines would not move us past the current iMCU row, we + * read the lines and ignore them. There might be a faster way of doing + * this, but we are facing increasing complexity for diminishing returns. + * The increasing complexity would be a by-product of meddling with the + * state machine used to skip context rows. Near the end of an iMCU row, + * the next iMCU row may have already been entropy-decoded. In this unique + * case, we will read the next iMCU row if we cannot skip past it as well. + */ + if ((num_lines < lines_left_in_iMCU_row + 1) || + (lines_left_in_iMCU_row <= 1 && main_ptr->buffer_full && + lines_after_iMCU_row < lines_per_iMCU_row + 1)) { + read_and_discard_scanlines(cinfo, num_lines); + return num_lines; + } + + /* If the next iMCU row has already been entropy-decoded, make sure that + * we do not skip too far. + */ + if (lines_left_in_iMCU_row <= 1 && main_ptr->buffer_full) { + cinfo->output_scanline += lines_left_in_iMCU_row + lines_per_iMCU_row; + lines_after_iMCU_row -= lines_per_iMCU_row; + } else { + cinfo->output_scanline += lines_left_in_iMCU_row; + } + + /* If we have just completed the first block, adjust the buffer pointers */ + if (main_ptr->iMCU_row_ctr == 0 || + (main_ptr->iMCU_row_ctr == 1 && lines_left_in_iMCU_row > 2)) + set_wraparound_pointers(cinfo); + main_ptr->buffer_full = FALSE; + main_ptr->rowgroup_ctr = 0; + main_ptr->context_state = CTX_PREPARE_FOR_IMCU; + upsample->next_row_out = cinfo->max_v_samp_factor; + upsample->rows_to_go = cinfo->output_height - cinfo->output_scanline; + } + + /* Skipping is much simpler when context rows are not required. */ + else { + if (num_lines < lines_left_in_iMCU_row) { + increment_simple_rowgroup_ctr(cinfo, num_lines); + return num_lines; + } else { + cinfo->output_scanline += lines_left_in_iMCU_row; + main_ptr->buffer_full = FALSE; + main_ptr->rowgroup_ctr = 0; + upsample->next_row_out = cinfo->max_v_samp_factor; + upsample->rows_to_go = cinfo->output_height - cinfo->output_scanline; + } + } + + /* Calculate how many full iMCU rows we can skip. */ + if (cinfo->upsample->need_context_rows) + lines_to_skip = ((lines_after_iMCU_row - 1) / lines_per_iMCU_row) * + lines_per_iMCU_row; + else + lines_to_skip = (lines_after_iMCU_row / lines_per_iMCU_row) * + lines_per_iMCU_row; + /* Calculate the number of lines that remain to be skipped after skipping all + * of the full iMCU rows that we can. We will not read these lines unless we + * have to. + */ + lines_to_read = lines_after_iMCU_row - lines_to_skip; + + /* For images requiring multiple scans (progressive, non-interleaved, etc.), + * all of the entropy decoding occurs in jpeg_start_decompress(), assuming + * that the input data source is non-suspending. This makes skipping easy. + */ + if (cinfo->inputctl->has_multiple_scans) { + if (cinfo->upsample->need_context_rows) { + cinfo->output_scanline += lines_to_skip; + cinfo->output_iMCU_row += lines_to_skip / lines_per_iMCU_row; + main_ptr->iMCU_row_ctr += lines_after_iMCU_row / lines_per_iMCU_row; + /* It is complex to properly move to the middle of a context block, so + * read the remaining lines instead of skipping them. + */ + read_and_discard_scanlines(cinfo, lines_to_read); + } else { + cinfo->output_scanline += lines_to_skip; + cinfo->output_iMCU_row += lines_to_skip / lines_per_iMCU_row; + increment_simple_rowgroup_ctr(cinfo, lines_to_read); + } + upsample->rows_to_go = cinfo->output_height - cinfo->output_scanline; + return num_lines; + } + + /* Skip the iMCU rows that we can safely skip. */ + for (i = 0; i < lines_to_skip; i += lines_per_iMCU_row) { + for (y = 0; y < coef->MCU_rows_per_iMCU_row; y++) { + for (x = 0; x < cinfo->MCUs_per_row; x++) { + /* Calling decode_mcu() with a NULL pointer causes it to discard the + * decoded coefficients. This is ~5% faster for large subsets, but + * it's tough to tell a difference for smaller images. + */ + (*cinfo->entropy->decode_mcu) (cinfo, NULL); + } + } + cinfo->input_iMCU_row++; + cinfo->output_iMCU_row++; + if (cinfo->input_iMCU_row < cinfo->total_iMCU_rows) + start_iMCU_row(cinfo); + else + (*cinfo->inputctl->finish_input_pass) (cinfo); + } + cinfo->output_scanline += lines_to_skip; + + if (cinfo->upsample->need_context_rows) { + /* Context-based upsampling keeps track of iMCU rows. */ + main_ptr->iMCU_row_ctr += lines_to_skip / lines_per_iMCU_row; + + /* It is complex to properly move to the middle of a context block, so + * read the remaining lines instead of skipping them. + */ + read_and_discard_scanlines(cinfo, lines_to_read); + } else { + increment_simple_rowgroup_ctr(cinfo, lines_to_read); + } + + /* Since skipping lines involves skipping the upsampling step, the value of + * "rows_to_go" will become invalid unless we set it here. NOTE: This is a + * bit odd, since "rows_to_go" seems to be redundantly keeping track of + * output_scanline. + */ + upsample->rows_to_go = cinfo->output_height - cinfo->output_scanline; + + /* Always skip the requested number of lines. */ + return num_lines; +} + +/* + * Alternate entry point to read raw data. + * Processes exactly one iMCU row per call, unless suspended. + */ + +GLOBAL(JDIMENSION) +jpeg_read_raw_data (j_decompress_ptr cinfo, JSAMPIMAGE data, + JDIMENSION max_lines) +{ + JDIMENSION lines_per_iMCU_row; + + if (cinfo->global_state != DSTATE_RAW_OK) + ERREXIT1(cinfo, JERR_BAD_STATE, cinfo->global_state); + if (cinfo->output_scanline >= cinfo->output_height) { + WARNMS(cinfo, JWRN_TOO_MUCH_DATA); + return 0; + } + + /* Call progress monitor hook if present */ + if (cinfo->progress != NULL) { + cinfo->progress->pass_counter = (long) cinfo->output_scanline; + cinfo->progress->pass_limit = (long) cinfo->output_height; + (*cinfo->progress->progress_monitor) ((j_common_ptr) cinfo); + } + + /* Verify that at least one iMCU row can be returned. */ + lines_per_iMCU_row = cinfo->max_v_samp_factor * cinfo->_min_DCT_scaled_size; + if (max_lines < lines_per_iMCU_row) + ERREXIT(cinfo, JERR_BUFFER_SIZE); + + /* Decompress directly into user's buffer. */ + if (! (*cinfo->coef->decompress_data) (cinfo, data)) + return 0; /* suspension forced, can do nothing more */ + + /* OK, we processed one iMCU row. */ + cinfo->output_scanline += lines_per_iMCU_row; + return lines_per_iMCU_row; +} + + +/* Additional entry points for buffered-image mode. */ + +#ifdef D_MULTISCAN_FILES_SUPPORTED + +/* + * Initialize for an output pass in buffered-image mode. + */ + +GLOBAL(boolean) +jpeg_start_output (j_decompress_ptr cinfo, int scan_number) +{ + if (cinfo->global_state != DSTATE_BUFIMAGE && + cinfo->global_state != DSTATE_PRESCAN) + ERREXIT1(cinfo, JERR_BAD_STATE, cinfo->global_state); + /* Limit scan number to valid range */ + if (scan_number <= 0) + scan_number = 1; + if (cinfo->inputctl->eoi_reached && + scan_number > cinfo->input_scan_number) + scan_number = cinfo->input_scan_number; + cinfo->output_scan_number = scan_number; + /* Perform any dummy output passes, and set up for the real pass */ + return output_pass_setup(cinfo); +} + + +/* + * Finish up after an output pass in buffered-image mode. + * + * Returns FALSE if suspended. The return value need be inspected only if + * a suspending data source is used. + */ + +GLOBAL(boolean) +jpeg_finish_output (j_decompress_ptr cinfo) +{ + if ((cinfo->global_state == DSTATE_SCANNING || + cinfo->global_state == DSTATE_RAW_OK) && cinfo->buffered_image) { + /* Terminate this pass. */ + /* We do not require the whole pass to have been completed. */ + (*cinfo->master->finish_output_pass) (cinfo); + cinfo->global_state = DSTATE_BUFPOST; + } else if (cinfo->global_state != DSTATE_BUFPOST) { + /* BUFPOST = repeat call after a suspension, anything else is error */ + ERREXIT1(cinfo, JERR_BAD_STATE, cinfo->global_state); + } + /* Read markers looking for SOS or EOI */ + while (cinfo->input_scan_number <= cinfo->output_scan_number && + ! cinfo->inputctl->eoi_reached) { + if ((*cinfo->inputctl->consume_input) (cinfo) == JPEG_SUSPENDED) + return FALSE; /* Suspend, come back later */ + } + cinfo->global_state = DSTATE_BUFIMAGE; + return TRUE; +} + +#endif /* D_MULTISCAN_FILES_SUPPORTED */ diff --git a/media/libjpeg/jdarith.c b/media/libjpeg/jdarith.c new file mode 100644 index 000000000..df3540eef --- /dev/null +++ b/media/libjpeg/jdarith.c @@ -0,0 +1,766 @@ +/* + * jdarith.c + * + * This file was part of the Independent JPEG Group's software: + * Developed 1997-2015 by Guido Vollbeding. + * libjpeg-turbo Modifications: + * Copyright (C) 2015-2016, D. R. Commander. + * For conditions of distribution and use, see the accompanying README.ijg + * file. + * + * This file contains portable arithmetic entropy decoding routines for JPEG + * (implementing the ISO/IEC IS 10918-1 and CCITT Recommendation ITU-T T.81). + * + * Both sequential and progressive modes are supported in this single module. + * + * Suspension is not currently supported in this module. + */ + +#define JPEG_INTERNALS +#include "jinclude.h" +#include "jpeglib.h" + + +/* Expanded entropy decoder object for arithmetic decoding. */ + +typedef struct { + struct jpeg_entropy_decoder pub; /* public fields */ + + JLONG c; /* C register, base of coding interval + input bit buffer */ + JLONG a; /* A register, normalized size of coding interval */ + int ct; /* bit shift counter, # of bits left in bit buffer part of C */ + /* init: ct = -16 */ + /* run: ct = 0..7 */ + /* error: ct = -1 */ + int last_dc_val[MAX_COMPS_IN_SCAN]; /* last DC coef for each component */ + int dc_context[MAX_COMPS_IN_SCAN]; /* context index for DC conditioning */ + + unsigned int restarts_to_go; /* MCUs left in this restart interval */ + + /* Pointers to statistics areas (these workspaces have image lifespan) */ + unsigned char *dc_stats[NUM_ARITH_TBLS]; + unsigned char *ac_stats[NUM_ARITH_TBLS]; + + /* Statistics bin for coding with fixed probability 0.5 */ + unsigned char fixed_bin[4]; +} arith_entropy_decoder; + +typedef arith_entropy_decoder *arith_entropy_ptr; + +/* The following two definitions specify the allocation chunk size + * for the statistics area. + * According to sections F.1.4.4.1.3 and F.1.4.4.2, we need at least + * 49 statistics bins for DC, and 245 statistics bins for AC coding. + * + * We use a compact representation with 1 byte per statistics bin, + * thus the numbers directly represent byte sizes. + * This 1 byte per statistics bin contains the meaning of the MPS + * (more probable symbol) in the highest bit (mask 0x80), and the + * index into the probability estimation state machine table + * in the lower bits (mask 0x7F). + */ + +#define DC_STAT_BINS 64 +#define AC_STAT_BINS 256 + + +LOCAL(int) +get_byte (j_decompress_ptr cinfo) +/* Read next input byte; we do not support suspension in this module. */ +{ + struct jpeg_source_mgr *src = cinfo->src; + + if (src->bytes_in_buffer == 0) + if (! (*src->fill_input_buffer) (cinfo)) + ERREXIT(cinfo, JERR_CANT_SUSPEND); + src->bytes_in_buffer--; + return GETJOCTET(*src->next_input_byte++); +} + + +/* + * The core arithmetic decoding routine (common in JPEG and JBIG). + * This needs to go as fast as possible. + * Machine-dependent optimization facilities + * are not utilized in this portable implementation. + * However, this code should be fairly efficient and + * may be a good base for further optimizations anyway. + * + * Return value is 0 or 1 (binary decision). + * + * Note: I've changed the handling of the code base & bit + * buffer register C compared to other implementations + * based on the standards layout & procedures. + * While it also contains both the actual base of the + * coding interval (16 bits) and the next-bits buffer, + * the cut-point between these two parts is floating + * (instead of fixed) with the bit shift counter CT. + * Thus, we also need only one (variable instead of + * fixed size) shift for the LPS/MPS decision, and + * we can do away with any renormalization update + * of C (except for new data insertion, of course). + * + * I've also introduced a new scheme for accessing + * the probability estimation state machine table, + * derived from Markus Kuhn's JBIG implementation. + */ + +LOCAL(int) +arith_decode (j_decompress_ptr cinfo, unsigned char *st) +{ + register arith_entropy_ptr e = (arith_entropy_ptr) cinfo->entropy; + register unsigned char nl, nm; + register JLONG qe, temp; + register int sv, data; + + /* Renormalization & data input per section D.2.6 */ + while (e->a < 0x8000L) { + if (--e->ct < 0) { + /* Need to fetch next data byte */ + if (cinfo->unread_marker) + data = 0; /* stuff zero data */ + else { + data = get_byte(cinfo); /* read next input byte */ + if (data == 0xFF) { /* zero stuff or marker code */ + do data = get_byte(cinfo); + while (data == 0xFF); /* swallow extra 0xFF bytes */ + if (data == 0) + data = 0xFF; /* discard stuffed zero byte */ + else { + /* Note: Different from the Huffman decoder, hitting + * a marker while processing the compressed data + * segment is legal in arithmetic coding. + * The convention is to supply zero data + * then until decoding is complete. + */ + cinfo->unread_marker = data; + data = 0; + } + } + } + e->c = (e->c << 8) | data; /* insert data into C register */ + if ((e->ct += 8) < 0) /* update bit shift counter */ + /* Need more initial bytes */ + if (++e->ct == 0) + /* Got 2 initial bytes -> re-init A and exit loop */ + e->a = 0x8000L; /* => e->a = 0x10000L after loop exit */ + } + e->a <<= 1; + } + + /* Fetch values from our compact representation of Table D.2: + * Qe values and probability estimation state machine + */ + sv = *st; + qe = jpeg_aritab[sv & 0x7F]; /* => Qe_Value */ + nl = qe & 0xFF; qe >>= 8; /* Next_Index_LPS + Switch_MPS */ + nm = qe & 0xFF; qe >>= 8; /* Next_Index_MPS */ + + /* Decode & estimation procedures per sections D.2.4 & D.2.5 */ + temp = e->a - qe; + e->a = temp; + temp <<= e->ct; + if (e->c >= temp) { + e->c -= temp; + /* Conditional LPS (less probable symbol) exchange */ + if (e->a < qe) { + e->a = qe; + *st = (sv & 0x80) ^ nm; /* Estimate_after_MPS */ + } else { + e->a = qe; + *st = (sv & 0x80) ^ nl; /* Estimate_after_LPS */ + sv ^= 0x80; /* Exchange LPS/MPS */ + } + } else if (e->a < 0x8000L) { + /* Conditional MPS (more probable symbol) exchange */ + if (e->a < qe) { + *st = (sv & 0x80) ^ nl; /* Estimate_after_LPS */ + sv ^= 0x80; /* Exchange LPS/MPS */ + } else { + *st = (sv & 0x80) ^ nm; /* Estimate_after_MPS */ + } + } + + return sv >> 7; +} + + +/* + * Check for a restart marker & resynchronize decoder. + */ + +LOCAL(void) +process_restart (j_decompress_ptr cinfo) +{ + arith_entropy_ptr entropy = (arith_entropy_ptr) cinfo->entropy; + int ci; + jpeg_component_info *compptr; + + /* Advance past the RSTn marker */ + if (! (*cinfo->marker->read_restart_marker) (cinfo)) + ERREXIT(cinfo, JERR_CANT_SUSPEND); + + /* Re-initialize statistics areas */ + for (ci = 0; ci < cinfo->comps_in_scan; ci++) { + compptr = cinfo->cur_comp_info[ci]; + if (!cinfo->progressive_mode || (cinfo->Ss == 0 && cinfo->Ah == 0)) { + MEMZERO(entropy->dc_stats[compptr->dc_tbl_no], DC_STAT_BINS); + /* Reset DC predictions to 0 */ + entropy->last_dc_val[ci] = 0; + entropy->dc_context[ci] = 0; + } + if (!cinfo->progressive_mode || cinfo->Ss) { + MEMZERO(entropy->ac_stats[compptr->ac_tbl_no], AC_STAT_BINS); + } + } + + /* Reset arithmetic decoding variables */ + entropy->c = 0; + entropy->a = 0; + entropy->ct = -16; /* force reading 2 initial bytes to fill C */ + + /* Reset restart counter */ + entropy->restarts_to_go = cinfo->restart_interval; +} + + +/* + * Arithmetic MCU decoding. + * Each of these routines decodes and returns one MCU's worth of + * arithmetic-compressed coefficients. + * The coefficients are reordered from zigzag order into natural array order, + * but are not dequantized. + * + * The i'th block of the MCU is stored into the block pointed to by + * MCU_data[i]. WE ASSUME THIS AREA IS INITIALLY ZEROED BY THE CALLER. + */ + +/* + * MCU decoding for DC initial scan (either spectral selection, + * or first pass of successive approximation). + */ + +METHODDEF(boolean) +decode_mcu_DC_first (j_decompress_ptr cinfo, JBLOCKROW *MCU_data) +{ + arith_entropy_ptr entropy = (arith_entropy_ptr) cinfo->entropy; + JBLOCKROW block; + unsigned char *st; + int blkn, ci, tbl, sign; + int v, m; + + /* Process restart marker if needed */ + if (cinfo->restart_interval) { + if (entropy->restarts_to_go == 0) + process_restart(cinfo); + entropy->restarts_to_go--; + } + + if (entropy->ct == -1) return TRUE; /* if error do nothing */ + + /* Outer loop handles each block in the MCU */ + + for (blkn = 0; blkn < cinfo->blocks_in_MCU; blkn++) { + block = MCU_data[blkn]; + ci = cinfo->MCU_membership[blkn]; + tbl = cinfo->cur_comp_info[ci]->dc_tbl_no; + + /* Sections F.2.4.1 & F.1.4.4.1: Decoding of DC coefficients */ + + /* Table F.4: Point to statistics bin S0 for DC coefficient coding */ + st = entropy->dc_stats[tbl] + entropy->dc_context[ci]; + + /* Figure F.19: Decode_DC_DIFF */ + if (arith_decode(cinfo, st) == 0) + entropy->dc_context[ci] = 0; + else { + /* Figure F.21: Decoding nonzero value v */ + /* Figure F.22: Decoding the sign of v */ + sign = arith_decode(cinfo, st + 1); + st += 2; st += sign; + /* Figure F.23: Decoding the magnitude category of v */ + if ((m = arith_decode(cinfo, st)) != 0) { + st = entropy->dc_stats[tbl] + 20; /* Table F.4: X1 = 20 */ + while (arith_decode(cinfo, st)) { + if ((m <<= 1) == 0x8000) { + WARNMS(cinfo, JWRN_ARITH_BAD_CODE); + entropy->ct = -1; /* magnitude overflow */ + return TRUE; + } + st += 1; + } + } + /* Section F.1.4.4.1.2: Establish dc_context conditioning category */ + if (m < (int) ((1L << cinfo->arith_dc_L[tbl]) >> 1)) + entropy->dc_context[ci] = 0; /* zero diff category */ + else if (m > (int) ((1L << cinfo->arith_dc_U[tbl]) >> 1)) + entropy->dc_context[ci] = 12 + (sign * 4); /* large diff category */ + else + entropy->dc_context[ci] = 4 + (sign * 4); /* small diff category */ + v = m; + /* Figure F.24: Decoding the magnitude bit pattern of v */ + st += 14; + while (m >>= 1) + if (arith_decode(cinfo, st)) v |= m; + v += 1; if (sign) v = -v; + entropy->last_dc_val[ci] += v; + } + + /* Scale and output the DC coefficient (assumes jpeg_natural_order[0]=0) */ + (*block)[0] = (JCOEF) LEFT_SHIFT(entropy->last_dc_val[ci], cinfo->Al); + } + + return TRUE; +} + + +/* + * MCU decoding for AC initial scan (either spectral selection, + * or first pass of successive approximation). + */ + +METHODDEF(boolean) +decode_mcu_AC_first (j_decompress_ptr cinfo, JBLOCKROW *MCU_data) +{ + arith_entropy_ptr entropy = (arith_entropy_ptr) cinfo->entropy; + JBLOCKROW block; + unsigned char *st; + int tbl, sign, k; + int v, m; + + /* Process restart marker if needed */ + if (cinfo->restart_interval) { + if (entropy->restarts_to_go == 0) + process_restart(cinfo); + entropy->restarts_to_go--; + } + + if (entropy->ct == -1) return TRUE; /* if error do nothing */ + + /* There is always only one block per MCU */ + block = MCU_data[0]; + tbl = cinfo->cur_comp_info[0]->ac_tbl_no; + + /* Sections F.2.4.2 & F.1.4.4.2: Decoding of AC coefficients */ + + /* Figure F.20: Decode_AC_coefficients */ + for (k = cinfo->Ss; k <= cinfo->Se; k++) { + st = entropy->ac_stats[tbl] + 3 * (k - 1); + if (arith_decode(cinfo, st)) break; /* EOB flag */ + while (arith_decode(cinfo, st + 1) == 0) { + st += 3; k++; + if (k > cinfo->Se) { + WARNMS(cinfo, JWRN_ARITH_BAD_CODE); + entropy->ct = -1; /* spectral overflow */ + return TRUE; + } + } + /* Figure F.21: Decoding nonzero value v */ + /* Figure F.22: Decoding the sign of v */ + sign = arith_decode(cinfo, entropy->fixed_bin); + st += 2; + /* Figure F.23: Decoding the magnitude category of v */ + if ((m = arith_decode(cinfo, st)) != 0) { + if (arith_decode(cinfo, st)) { + m <<= 1; + st = entropy->ac_stats[tbl] + + (k <= cinfo->arith_ac_K[tbl] ? 189 : 217); + while (arith_decode(cinfo, st)) { + if ((m <<= 1) == 0x8000) { + WARNMS(cinfo, JWRN_ARITH_BAD_CODE); + entropy->ct = -1; /* magnitude overflow */ + return TRUE; + } + st += 1; + } + } + } + v = m; + /* Figure F.24: Decoding the magnitude bit pattern of v */ + st += 14; + while (m >>= 1) + if (arith_decode(cinfo, st)) v |= m; + v += 1; if (sign) v = -v; + /* Scale and output coefficient in natural (dezigzagged) order */ + (*block)[jpeg_natural_order[k]] = (JCOEF) ((unsigned)v << cinfo->Al); + } + + return TRUE; +} + + +/* + * MCU decoding for DC successive approximation refinement scan. + */ + +METHODDEF(boolean) +decode_mcu_DC_refine (j_decompress_ptr cinfo, JBLOCKROW *MCU_data) +{ + arith_entropy_ptr entropy = (arith_entropy_ptr) cinfo->entropy; + unsigned char *st; + int p1, blkn; + + /* Process restart marker if needed */ + if (cinfo->restart_interval) { + if (entropy->restarts_to_go == 0) + process_restart(cinfo); + entropy->restarts_to_go--; + } + + st = entropy->fixed_bin; /* use fixed probability estimation */ + p1 = 1 << cinfo->Al; /* 1 in the bit position being coded */ + + /* Outer loop handles each block in the MCU */ + + for (blkn = 0; blkn < cinfo->blocks_in_MCU; blkn++) { + /* Encoded data is simply the next bit of the two's-complement DC value */ + if (arith_decode(cinfo, st)) + MCU_data[blkn][0][0] |= p1; + } + + return TRUE; +} + + +/* + * MCU decoding for AC successive approximation refinement scan. + */ + +METHODDEF(boolean) +decode_mcu_AC_refine (j_decompress_ptr cinfo, JBLOCKROW *MCU_data) +{ + arith_entropy_ptr entropy = (arith_entropy_ptr) cinfo->entropy; + JBLOCKROW block; + JCOEFPTR thiscoef; + unsigned char *st; + int tbl, k, kex; + int p1, m1; + + /* Process restart marker if needed */ + if (cinfo->restart_interval) { + if (entropy->restarts_to_go == 0) + process_restart(cinfo); + entropy->restarts_to_go--; + } + + if (entropy->ct == -1) return TRUE; /* if error do nothing */ + + /* There is always only one block per MCU */ + block = MCU_data[0]; + tbl = cinfo->cur_comp_info[0]->ac_tbl_no; + + p1 = 1 << cinfo->Al; /* 1 in the bit position being coded */ + m1 = (-1) << cinfo->Al; /* -1 in the bit position being coded */ + + /* Establish EOBx (previous stage end-of-block) index */ + for (kex = cinfo->Se; kex > 0; kex--) + if ((*block)[jpeg_natural_order[kex]]) break; + + for (k = cinfo->Ss; k <= cinfo->Se; k++) { + st = entropy->ac_stats[tbl] + 3 * (k - 1); + if (k > kex) + if (arith_decode(cinfo, st)) break; /* EOB flag */ + for (;;) { + thiscoef = *block + jpeg_natural_order[k]; + if (*thiscoef) { /* previously nonzero coef */ + if (arith_decode(cinfo, st + 2)) { + if (*thiscoef < 0) + *thiscoef += m1; + else + *thiscoef += p1; + } + break; + } + if (arith_decode(cinfo, st + 1)) { /* newly nonzero coef */ + if (arith_decode(cinfo, entropy->fixed_bin)) + *thiscoef = m1; + else + *thiscoef = p1; + break; + } + st += 3; k++; + if (k > cinfo->Se) { + WARNMS(cinfo, JWRN_ARITH_BAD_CODE); + entropy->ct = -1; /* spectral overflow */ + return TRUE; + } + } + } + + return TRUE; +} + + +/* + * Decode one MCU's worth of arithmetic-compressed coefficients. + */ + +METHODDEF(boolean) +decode_mcu (j_decompress_ptr cinfo, JBLOCKROW *MCU_data) +{ + arith_entropy_ptr entropy = (arith_entropy_ptr) cinfo->entropy; + jpeg_component_info *compptr; + JBLOCKROW block; + unsigned char *st; + int blkn, ci, tbl, sign, k; + int v, m; + + /* Process restart marker if needed */ + if (cinfo->restart_interval) { + if (entropy->restarts_to_go == 0) + process_restart(cinfo); + entropy->restarts_to_go--; + } + + if (entropy->ct == -1) return TRUE; /* if error do nothing */ + + /* Outer loop handles each block in the MCU */ + + for (blkn = 0; blkn < cinfo->blocks_in_MCU; blkn++) { + block = MCU_data ? MCU_data[blkn] : NULL; + ci = cinfo->MCU_membership[blkn]; + compptr = cinfo->cur_comp_info[ci]; + + /* Sections F.2.4.1 & F.1.4.4.1: Decoding of DC coefficients */ + + tbl = compptr->dc_tbl_no; + + /* Table F.4: Point to statistics bin S0 for DC coefficient coding */ + st = entropy->dc_stats[tbl] + entropy->dc_context[ci]; + + /* Figure F.19: Decode_DC_DIFF */ + if (arith_decode(cinfo, st) == 0) + entropy->dc_context[ci] = 0; + else { + /* Figure F.21: Decoding nonzero value v */ + /* Figure F.22: Decoding the sign of v */ + sign = arith_decode(cinfo, st + 1); + st += 2; st += sign; + /* Figure F.23: Decoding the magnitude category of v */ + if ((m = arith_decode(cinfo, st)) != 0) { + st = entropy->dc_stats[tbl] + 20; /* Table F.4: X1 = 20 */ + while (arith_decode(cinfo, st)) { + if ((m <<= 1) == 0x8000) { + WARNMS(cinfo, JWRN_ARITH_BAD_CODE); + entropy->ct = -1; /* magnitude overflow */ + return TRUE; + } + st += 1; + } + } + /* Section F.1.4.4.1.2: Establish dc_context conditioning category */ + if (m < (int) ((1L << cinfo->arith_dc_L[tbl]) >> 1)) + entropy->dc_context[ci] = 0; /* zero diff category */ + else if (m > (int) ((1L << cinfo->arith_dc_U[tbl]) >> 1)) + entropy->dc_context[ci] = 12 + (sign * 4); /* large diff category */ + else + entropy->dc_context[ci] = 4 + (sign * 4); /* small diff category */ + v = m; + /* Figure F.24: Decoding the magnitude bit pattern of v */ + st += 14; + while (m >>= 1) + if (arith_decode(cinfo, st)) v |= m; + v += 1; if (sign) v = -v; + entropy->last_dc_val[ci] += v; + } + + if (block) + (*block)[0] = (JCOEF) entropy->last_dc_val[ci]; + + /* Sections F.2.4.2 & F.1.4.4.2: Decoding of AC coefficients */ + + tbl = compptr->ac_tbl_no; + + /* Figure F.20: Decode_AC_coefficients */ + for (k = 1; k <= DCTSIZE2 - 1; k++) { + st = entropy->ac_stats[tbl] + 3 * (k - 1); + if (arith_decode(cinfo, st)) break; /* EOB flag */ + while (arith_decode(cinfo, st + 1) == 0) { + st += 3; k++; + if (k > DCTSIZE2 - 1) { + WARNMS(cinfo, JWRN_ARITH_BAD_CODE); + entropy->ct = -1; /* spectral overflow */ + return TRUE; + } + } + /* Figure F.21: Decoding nonzero value v */ + /* Figure F.22: Decoding the sign of v */ + sign = arith_decode(cinfo, entropy->fixed_bin); + st += 2; + /* Figure F.23: Decoding the magnitude category of v */ + if ((m = arith_decode(cinfo, st)) != 0) { + if (arith_decode(cinfo, st)) { + m <<= 1; + st = entropy->ac_stats[tbl] + + (k <= cinfo->arith_ac_K[tbl] ? 189 : 217); + while (arith_decode(cinfo, st)) { + if ((m <<= 1) == 0x8000) { + WARNMS(cinfo, JWRN_ARITH_BAD_CODE); + entropy->ct = -1; /* magnitude overflow */ + return TRUE; + } + st += 1; + } + } + } + v = m; + /* Figure F.24: Decoding the magnitude bit pattern of v */ + st += 14; + while (m >>= 1) + if (arith_decode(cinfo, st)) v |= m; + v += 1; if (sign) v = -v; + if (block) + (*block)[jpeg_natural_order[k]] = (JCOEF) v; + } + } + + return TRUE; +} + + +/* + * Initialize for an arithmetic-compressed scan. + */ + +METHODDEF(void) +start_pass (j_decompress_ptr cinfo) +{ + arith_entropy_ptr entropy = (arith_entropy_ptr) cinfo->entropy; + int ci, tbl; + jpeg_component_info *compptr; + + if (cinfo->progressive_mode) { + /* Validate progressive scan parameters */ + if (cinfo->Ss == 0) { + if (cinfo->Se != 0) + goto bad; + } else { + /* need not check Ss/Se < 0 since they came from unsigned bytes */ + if (cinfo->Se < cinfo->Ss || cinfo->Se > DCTSIZE2 - 1) + goto bad; + /* AC scans may have only one component */ + if (cinfo->comps_in_scan != 1) + goto bad; + } + if (cinfo->Ah != 0) { + /* Successive approximation refinement scan: must have Al = Ah-1. */ + if (cinfo->Ah-1 != cinfo->Al) + goto bad; + } + if (cinfo->Al > 13) { /* need not check for < 0 */ + bad: + ERREXIT4(cinfo, JERR_BAD_PROGRESSION, + cinfo->Ss, cinfo->Se, cinfo->Ah, cinfo->Al); + } + /* Update progression status, and verify that scan order is legal. + * Note that inter-scan inconsistencies are treated as warnings + * not fatal errors ... not clear if this is right way to behave. + */ + for (ci = 0; ci < cinfo->comps_in_scan; ci++) { + int coefi, cindex = cinfo->cur_comp_info[ci]->component_index; + int *coef_bit_ptr = & cinfo->coef_bits[cindex][0]; + if (cinfo->Ss && coef_bit_ptr[0] < 0) /* AC without prior DC scan */ + WARNMS2(cinfo, JWRN_BOGUS_PROGRESSION, cindex, 0); + for (coefi = cinfo->Ss; coefi <= cinfo->Se; coefi++) { + int expected = (coef_bit_ptr[coefi] < 0) ? 0 : coef_bit_ptr[coefi]; + if (cinfo->Ah != expected) + WARNMS2(cinfo, JWRN_BOGUS_PROGRESSION, cindex, coefi); + coef_bit_ptr[coefi] = cinfo->Al; + } + } + /* Select MCU decoding routine */ + if (cinfo->Ah == 0) { + if (cinfo->Ss == 0) + entropy->pub.decode_mcu = decode_mcu_DC_first; + else + entropy->pub.decode_mcu = decode_mcu_AC_first; + } else { + if (cinfo->Ss == 0) + entropy->pub.decode_mcu = decode_mcu_DC_refine; + else + entropy->pub.decode_mcu = decode_mcu_AC_refine; + } + } else { + /* Check that the scan parameters Ss, Se, Ah/Al are OK for sequential JPEG. + * This ought to be an error condition, but we make it a warning. + */ + if (cinfo->Ss != 0 || cinfo->Ah != 0 || cinfo->Al != 0 || + (cinfo->Se < DCTSIZE2 && cinfo->Se != DCTSIZE2 - 1)) + WARNMS(cinfo, JWRN_NOT_SEQUENTIAL); + /* Select MCU decoding routine */ + entropy->pub.decode_mcu = decode_mcu; + } + + /* Allocate & initialize requested statistics areas */ + for (ci = 0; ci < cinfo->comps_in_scan; ci++) { + compptr = cinfo->cur_comp_info[ci]; + if (!cinfo->progressive_mode || (cinfo->Ss == 0 && cinfo->Ah == 0)) { + tbl = compptr->dc_tbl_no; + if (tbl < 0 || tbl >= NUM_ARITH_TBLS) + ERREXIT1(cinfo, JERR_NO_ARITH_TABLE, tbl); + if (entropy->dc_stats[tbl] == NULL) + entropy->dc_stats[tbl] = (unsigned char *) (*cinfo->mem->alloc_small) + ((j_common_ptr) cinfo, JPOOL_IMAGE, DC_STAT_BINS); + MEMZERO(entropy->dc_stats[tbl], DC_STAT_BINS); + /* Initialize DC predictions to 0 */ + entropy->last_dc_val[ci] = 0; + entropy->dc_context[ci] = 0; + } + if (!cinfo->progressive_mode || cinfo->Ss) { + tbl = compptr->ac_tbl_no; + if (tbl < 0 || tbl >= NUM_ARITH_TBLS) + ERREXIT1(cinfo, JERR_NO_ARITH_TABLE, tbl); + if (entropy->ac_stats[tbl] == NULL) + entropy->ac_stats[tbl] = (unsigned char *) (*cinfo->mem->alloc_small) + ((j_common_ptr) cinfo, JPOOL_IMAGE, AC_STAT_BINS); + MEMZERO(entropy->ac_stats[tbl], AC_STAT_BINS); + } + } + + /* Initialize arithmetic decoding variables */ + entropy->c = 0; + entropy->a = 0; + entropy->ct = -16; /* force reading 2 initial bytes to fill C */ + + /* Initialize restart counter */ + entropy->restarts_to_go = cinfo->restart_interval; +} + + +/* + * Module initialization routine for arithmetic entropy decoding. + */ + +GLOBAL(void) +jinit_arith_decoder (j_decompress_ptr cinfo) +{ + arith_entropy_ptr entropy; + int i; + + entropy = (arith_entropy_ptr) + (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE, + sizeof(arith_entropy_decoder)); + cinfo->entropy = (struct jpeg_entropy_decoder *) entropy; + entropy->pub.start_pass = start_pass; + + /* Mark tables unallocated */ + for (i = 0; i < NUM_ARITH_TBLS; i++) { + entropy->dc_stats[i] = NULL; + entropy->ac_stats[i] = NULL; + } + + /* Initialize index for fixed probability estimation */ + entropy->fixed_bin[0] = 113; + + if (cinfo->progressive_mode) { + /* Create progression status table */ + int *coef_bit_ptr, ci; + cinfo->coef_bits = (int (*)[DCTSIZE2]) + (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE, + cinfo->num_components*DCTSIZE2*sizeof(int)); + coef_bit_ptr = & cinfo->coef_bits[0][0]; + for (ci = 0; ci < cinfo->num_components; ci++) + for (i = 0; i < DCTSIZE2; i++) + *coef_bit_ptr++ = -1; + } +} diff --git a/media/libjpeg/jdatadst.c b/media/libjpeg/jdatadst.c new file mode 100644 index 000000000..dcaf6f0f9 --- /dev/null +++ b/media/libjpeg/jdatadst.c @@ -0,0 +1,293 @@ +/* + * jdatadst.c + * + * This file was part of the Independent JPEG Group's software: + * Copyright (C) 1994-1996, Thomas G. Lane. + * Modified 2009-2012 by Guido Vollbeding. + * libjpeg-turbo Modifications: + * Copyright (C) 2013, 2016, D. R. Commander. + * For conditions of distribution and use, see the accompanying README.ijg + * file. + * + * This file contains compression data destination routines for the case of + * emitting JPEG data to memory or to a file (or any stdio stream). + * While these routines are sufficient for most applications, + * some will want to use a different destination manager. + * IMPORTANT: we assume that fwrite() will correctly transcribe an array of + * JOCTETs into 8-bit-wide elements on external storage. If char is wider + * than 8 bits on your machine, you may need to do some tweaking. + */ + +/* this is not a core library module, so it doesn't define JPEG_INTERNALS */ +#include "jinclude.h" +#include "jpeglib.h" +#include "jerror.h" + +#ifndef HAVE_STDLIB_H /* <stdlib.h> should declare malloc(),free() */ +extern void *malloc (size_t size); +extern void free (void *ptr); +#endif + + +/* Expanded data destination object for stdio output */ + +typedef struct { + struct jpeg_destination_mgr pub; /* public fields */ + + FILE *outfile; /* target stream */ + JOCTET *buffer; /* start of buffer */ +} my_destination_mgr; + +typedef my_destination_mgr *my_dest_ptr; + +#define OUTPUT_BUF_SIZE 4096 /* choose an efficiently fwrite'able size */ + + +#if JPEG_LIB_VERSION >= 80 || defined(MEM_SRCDST_SUPPORTED) +/* Expanded data destination object for memory output */ + +typedef struct { + struct jpeg_destination_mgr pub; /* public fields */ + + unsigned char **outbuffer; /* target buffer */ + unsigned long *outsize; + unsigned char *newbuffer; /* newly allocated buffer */ + JOCTET *buffer; /* start of buffer */ + size_t bufsize; +} my_mem_destination_mgr; + +typedef my_mem_destination_mgr *my_mem_dest_ptr; +#endif + + +/* + * Initialize destination --- called by jpeg_start_compress + * before any data is actually written. + */ + +METHODDEF(void) +init_destination (j_compress_ptr cinfo) +{ + my_dest_ptr dest = (my_dest_ptr) cinfo->dest; + + /* Allocate the output buffer --- it will be released when done with image */ + dest->buffer = (JOCTET *) + (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE, + OUTPUT_BUF_SIZE * sizeof(JOCTET)); + + dest->pub.next_output_byte = dest->buffer; + dest->pub.free_in_buffer = OUTPUT_BUF_SIZE; +} + +#if JPEG_LIB_VERSION >= 80 || defined(MEM_SRCDST_SUPPORTED) +METHODDEF(void) +init_mem_destination (j_compress_ptr cinfo) +{ + /* no work necessary here */ +} +#endif + + +/* + * Empty the output buffer --- called whenever buffer fills up. + * + * In typical applications, this should write the entire output buffer + * (ignoring the current state of next_output_byte & free_in_buffer), + * reset the pointer & count to the start of the buffer, and return TRUE + * indicating that the buffer has been dumped. + * + * In applications that need to be able to suspend compression due to output + * overrun, a FALSE return indicates that the buffer cannot be emptied now. + * In this situation, the compressor will return to its caller (possibly with + * an indication that it has not accepted all the supplied scanlines). The + * application should resume compression after it has made more room in the + * output buffer. Note that there are substantial restrictions on the use of + * suspension --- see the documentation. + * + * When suspending, the compressor will back up to a convenient restart point + * (typically the start of the current MCU). next_output_byte & free_in_buffer + * indicate where the restart point will be if the current call returns FALSE. + * Data beyond this point will be regenerated after resumption, so do not + * write it out when emptying the buffer externally. + */ + +METHODDEF(boolean) +empty_output_buffer (j_compress_ptr cinfo) +{ + my_dest_ptr dest = (my_dest_ptr) cinfo->dest; + + if (JFWRITE(dest->outfile, dest->buffer, OUTPUT_BUF_SIZE) != + (size_t) OUTPUT_BUF_SIZE) + ERREXIT(cinfo, JERR_FILE_WRITE); + + dest->pub.next_output_byte = dest->buffer; + dest->pub.free_in_buffer = OUTPUT_BUF_SIZE; + + return TRUE; +} + +#if JPEG_LIB_VERSION >= 80 || defined(MEM_SRCDST_SUPPORTED) +METHODDEF(boolean) +empty_mem_output_buffer (j_compress_ptr cinfo) +{ + size_t nextsize; + JOCTET *nextbuffer; + my_mem_dest_ptr dest = (my_mem_dest_ptr) cinfo->dest; + + /* Try to allocate new buffer with double size */ + nextsize = dest->bufsize * 2; + nextbuffer = (JOCTET *) malloc(nextsize); + + if (nextbuffer == NULL) + ERREXIT1(cinfo, JERR_OUT_OF_MEMORY, 10); + + MEMCOPY(nextbuffer, dest->buffer, dest->bufsize); + + if (dest->newbuffer != NULL) + free(dest->newbuffer); + + dest->newbuffer = nextbuffer; + + dest->pub.next_output_byte = nextbuffer + dest->bufsize; + dest->pub.free_in_buffer = dest->bufsize; + + dest->buffer = nextbuffer; + dest->bufsize = nextsize; + + return TRUE; +} +#endif + + +/* + * Terminate destination --- called by jpeg_finish_compress + * after all data has been written. Usually needs to flush buffer. + * + * NB: *not* called by jpeg_abort or jpeg_destroy; surrounding + * application must deal with any cleanup that should happen even + * for error exit. + */ + +METHODDEF(void) +term_destination (j_compress_ptr cinfo) +{ + my_dest_ptr dest = (my_dest_ptr) cinfo->dest; + size_t datacount = OUTPUT_BUF_SIZE - dest->pub.free_in_buffer; + + /* Write any data remaining in the buffer */ + if (datacount > 0) { + if (JFWRITE(dest->outfile, dest->buffer, datacount) != datacount) + ERREXIT(cinfo, JERR_FILE_WRITE); + } + fflush(dest->outfile); + /* Make sure we wrote the output file OK */ + if (ferror(dest->outfile)) + ERREXIT(cinfo, JERR_FILE_WRITE); +} + +#if JPEG_LIB_VERSION >= 80 || defined(MEM_SRCDST_SUPPORTED) +METHODDEF(void) +term_mem_destination (j_compress_ptr cinfo) +{ + my_mem_dest_ptr dest = (my_mem_dest_ptr) cinfo->dest; + + *dest->outbuffer = dest->buffer; + *dest->outsize = (unsigned long)(dest->bufsize - dest->pub.free_in_buffer); +} +#endif + + +/* + * Prepare for output to a stdio stream. + * The caller must have already opened the stream, and is responsible + * for closing it after finishing compression. + */ + +GLOBAL(void) +jpeg_stdio_dest (j_compress_ptr cinfo, FILE *outfile) +{ + my_dest_ptr dest; + + /* The destination object is made permanent so that multiple JPEG images + * can be written to the same file without re-executing jpeg_stdio_dest. + */ + if (cinfo->dest == NULL) { /* first time for this JPEG object? */ + cinfo->dest = (struct jpeg_destination_mgr *) + (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_PERMANENT, + sizeof(my_destination_mgr)); + } else if (cinfo->dest->init_destination != init_destination) { + /* It is unsafe to reuse the existing destination manager unless it was + * created by this function. Otherwise, there is no guarantee that the + * opaque structure is the right size. Note that we could just create a + * new structure, but the old structure would not be freed until + * jpeg_destroy_compress() was called. + */ + ERREXIT(cinfo, JERR_BUFFER_SIZE); + } + + dest = (my_dest_ptr) cinfo->dest; + dest->pub.init_destination = init_destination; + dest->pub.empty_output_buffer = empty_output_buffer; + dest->pub.term_destination = term_destination; + dest->outfile = outfile; +} + + +#if JPEG_LIB_VERSION >= 80 || defined(MEM_SRCDST_SUPPORTED) +/* + * Prepare for output to a memory buffer. + * The caller may supply an own initial buffer with appropriate size. + * Otherwise, or when the actual data output exceeds the given size, + * the library adapts the buffer size as necessary. + * The standard library functions malloc/free are used for allocating + * larger memory, so the buffer is available to the application after + * finishing compression, and then the application is responsible for + * freeing the requested memory. + * Note: An initial buffer supplied by the caller is expected to be + * managed by the application. The library does not free such buffer + * when allocating a larger buffer. + */ + +GLOBAL(void) +jpeg_mem_dest (j_compress_ptr cinfo, + unsigned char **outbuffer, unsigned long *outsize) +{ + my_mem_dest_ptr dest; + + if (outbuffer == NULL || outsize == NULL) /* sanity check */ + ERREXIT(cinfo, JERR_BUFFER_SIZE); + + /* The destination object is made permanent so that multiple JPEG images + * can be written to the same buffer without re-executing jpeg_mem_dest. + */ + if (cinfo->dest == NULL) { /* first time for this JPEG object? */ + cinfo->dest = (struct jpeg_destination_mgr *) + (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_PERMANENT, + sizeof(my_mem_destination_mgr)); + } else if (cinfo->dest->init_destination != init_mem_destination) { + /* It is unsafe to reuse the existing destination manager unless it was + * created by this function. + */ + ERREXIT(cinfo, JERR_BUFFER_SIZE); + } + + dest = (my_mem_dest_ptr) cinfo->dest; + dest->pub.init_destination = init_mem_destination; + dest->pub.empty_output_buffer = empty_mem_output_buffer; + dest->pub.term_destination = term_mem_destination; + dest->outbuffer = outbuffer; + dest->outsize = outsize; + dest->newbuffer = NULL; + + if (*outbuffer == NULL || *outsize == 0) { + /* Allocate initial buffer */ + dest->newbuffer = *outbuffer = (unsigned char *) malloc(OUTPUT_BUF_SIZE); + if (dest->newbuffer == NULL) + ERREXIT1(cinfo, JERR_OUT_OF_MEMORY, 10); + *outsize = OUTPUT_BUF_SIZE; + } + + dest->pub.next_output_byte = dest->buffer = *outbuffer; + dest->pub.free_in_buffer = dest->bufsize = *outsize; +} +#endif diff --git a/media/libjpeg/jdatasrc.c b/media/libjpeg/jdatasrc.c new file mode 100644 index 000000000..c83183fe1 --- /dev/null +++ b/media/libjpeg/jdatasrc.c @@ -0,0 +1,295 @@ +/* + * jdatasrc.c + * + * This file was part of the Independent JPEG Group's software: + * Copyright (C) 1994-1996, Thomas G. Lane. + * Modified 2009-2011 by Guido Vollbeding. + * libjpeg-turbo Modifications: + * Copyright (C) 2013, 2016, D. R. Commander. + * For conditions of distribution and use, see the accompanying README.ijg + * file. + * + * This file contains decompression data source routines for the case of + * reading JPEG data from memory or from a file (or any stdio stream). + * While these routines are sufficient for most applications, + * some will want to use a different source manager. + * IMPORTANT: we assume that fread() will correctly transcribe an array of + * JOCTETs from 8-bit-wide elements on external storage. If char is wider + * than 8 bits on your machine, you may need to do some tweaking. + */ + +/* this is not a core library module, so it doesn't define JPEG_INTERNALS */ +#include "jinclude.h" +#include "jpeglib.h" +#include "jerror.h" + + +/* Expanded data source object for stdio input */ + +typedef struct { + struct jpeg_source_mgr pub; /* public fields */ + + FILE *infile; /* source stream */ + JOCTET *buffer; /* start of buffer */ + boolean start_of_file; /* have we gotten any data yet? */ +} my_source_mgr; + +typedef my_source_mgr *my_src_ptr; + +#define INPUT_BUF_SIZE 4096 /* choose an efficiently fread'able size */ + + +/* + * Initialize source --- called by jpeg_read_header + * before any data is actually read. + */ + +METHODDEF(void) +init_source (j_decompress_ptr cinfo) +{ + my_src_ptr src = (my_src_ptr) cinfo->src; + + /* We reset the empty-input-file flag for each image, + * but we don't clear the input buffer. + * This is correct behavior for reading a series of images from one source. + */ + src->start_of_file = TRUE; +} + +#if JPEG_LIB_VERSION >= 80 || defined(MEM_SRCDST_SUPPORTED) +METHODDEF(void) +init_mem_source (j_decompress_ptr cinfo) +{ + /* no work necessary here */ +} +#endif + + +/* + * Fill the input buffer --- called whenever buffer is emptied. + * + * In typical applications, this should read fresh data into the buffer + * (ignoring the current state of next_input_byte & bytes_in_buffer), + * reset the pointer & count to the start of the buffer, and return TRUE + * indicating that the buffer has been reloaded. It is not necessary to + * fill the buffer entirely, only to obtain at least one more byte. + * + * There is no such thing as an EOF return. If the end of the file has been + * reached, the routine has a choice of ERREXIT() or inserting fake data into + * the buffer. In most cases, generating a warning message and inserting a + * fake EOI marker is the best course of action --- this will allow the + * decompressor to output however much of the image is there. However, + * the resulting error message is misleading if the real problem is an empty + * input file, so we handle that case specially. + * + * In applications that need to be able to suspend compression due to input + * not being available yet, a FALSE return indicates that no more data can be + * obtained right now, but more may be forthcoming later. In this situation, + * the decompressor will return to its caller (with an indication of the + * number of scanlines it has read, if any). The application should resume + * decompression after it has loaded more data into the input buffer. Note + * that there are substantial restrictions on the use of suspension --- see + * the documentation. + * + * When suspending, the decompressor will back up to a convenient restart point + * (typically the start of the current MCU). next_input_byte & bytes_in_buffer + * indicate where the restart point will be if the current call returns FALSE. + * Data beyond this point must be rescanned after resumption, so move it to + * the front of the buffer rather than discarding it. + */ + +METHODDEF(boolean) +fill_input_buffer (j_decompress_ptr cinfo) +{ + my_src_ptr src = (my_src_ptr) cinfo->src; + size_t nbytes; + + nbytes = JFREAD(src->infile, src->buffer, INPUT_BUF_SIZE); + + if (nbytes <= 0) { + if (src->start_of_file) /* Treat empty input file as fatal error */ + ERREXIT(cinfo, JERR_INPUT_EMPTY); + WARNMS(cinfo, JWRN_JPEG_EOF); + /* Insert a fake EOI marker */ + src->buffer[0] = (JOCTET) 0xFF; + src->buffer[1] = (JOCTET) JPEG_EOI; + nbytes = 2; + } + + src->pub.next_input_byte = src->buffer; + src->pub.bytes_in_buffer = nbytes; + src->start_of_file = FALSE; + + return TRUE; +} + +#if JPEG_LIB_VERSION >= 80 || defined(MEM_SRCDST_SUPPORTED) +METHODDEF(boolean) +fill_mem_input_buffer (j_decompress_ptr cinfo) +{ + static const JOCTET mybuffer[4] = { + (JOCTET) 0xFF, (JOCTET) JPEG_EOI, 0, 0 + }; + + /* The whole JPEG data is expected to reside in the supplied memory + * buffer, so any request for more data beyond the given buffer size + * is treated as an error. + */ + WARNMS(cinfo, JWRN_JPEG_EOF); + + /* Insert a fake EOI marker */ + + cinfo->src->next_input_byte = mybuffer; + cinfo->src->bytes_in_buffer = 2; + + return TRUE; +} +#endif + + +/* + * Skip data --- used to skip over a potentially large amount of + * uninteresting data (such as an APPn marker). + * + * Writers of suspendable-input applications must note that skip_input_data + * is not granted the right to give a suspension return. If the skip extends + * beyond the data currently in the buffer, the buffer can be marked empty so + * that the next read will cause a fill_input_buffer call that can suspend. + * Arranging for additional bytes to be discarded before reloading the input + * buffer is the application writer's problem. + */ + +METHODDEF(void) +skip_input_data (j_decompress_ptr cinfo, long num_bytes) +{ + struct jpeg_source_mgr *src = cinfo->src; + + /* Just a dumb implementation for now. Could use fseek() except + * it doesn't work on pipes. Not clear that being smart is worth + * any trouble anyway --- large skips are infrequent. + */ + if (num_bytes > 0) { + while (num_bytes > (long) src->bytes_in_buffer) { + num_bytes -= (long) src->bytes_in_buffer; + (void) (*src->fill_input_buffer) (cinfo); + /* note we assume that fill_input_buffer will never return FALSE, + * so suspension need not be handled. + */ + } + src->next_input_byte += (size_t) num_bytes; + src->bytes_in_buffer -= (size_t) num_bytes; + } +} + + +/* + * An additional method that can be provided by data source modules is the + * resync_to_restart method for error recovery in the presence of RST markers. + * For the moment, this source module just uses the default resync method + * provided by the JPEG library. That method assumes that no backtracking + * is possible. + */ + + +/* + * Terminate source --- called by jpeg_finish_decompress + * after all data has been read. Often a no-op. + * + * NB: *not* called by jpeg_abort or jpeg_destroy; surrounding + * application must deal with any cleanup that should happen even + * for error exit. + */ + +METHODDEF(void) +term_source (j_decompress_ptr cinfo) +{ + /* no work necessary here */ +} + + +/* + * Prepare for input from a stdio stream. + * The caller must have already opened the stream, and is responsible + * for closing it after finishing decompression. + */ + +GLOBAL(void) +jpeg_stdio_src (j_decompress_ptr cinfo, FILE *infile) +{ + my_src_ptr src; + + /* The source object and input buffer are made permanent so that a series + * of JPEG images can be read from the same file by calling jpeg_stdio_src + * only before the first one. (If we discarded the buffer at the end of + * one image, we'd likely lose the start of the next one.) + */ + if (cinfo->src == NULL) { /* first time for this JPEG object? */ + cinfo->src = (struct jpeg_source_mgr *) + (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_PERMANENT, + sizeof(my_source_mgr)); + src = (my_src_ptr) cinfo->src; + src->buffer = (JOCTET *) + (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_PERMANENT, + INPUT_BUF_SIZE * sizeof(JOCTET)); + } else if (cinfo->src->init_source != init_source) { + /* It is unsafe to reuse the existing source manager unless it was created + * by this function. Otherwise, there is no guarantee that the opaque + * structure is the right size. Note that we could just create a new + * structure, but the old structure would not be freed until + * jpeg_destroy_decompress() was called. + */ + ERREXIT(cinfo, JERR_BUFFER_SIZE); + } + + src = (my_src_ptr) cinfo->src; + src->pub.init_source = init_source; + src->pub.fill_input_buffer = fill_input_buffer; + src->pub.skip_input_data = skip_input_data; + src->pub.resync_to_restart = jpeg_resync_to_restart; /* use default method */ + src->pub.term_source = term_source; + src->infile = infile; + src->pub.bytes_in_buffer = 0; /* forces fill_input_buffer on first read */ + src->pub.next_input_byte = NULL; /* until buffer loaded */ +} + + +#if JPEG_LIB_VERSION >= 80 || defined(MEM_SRCDST_SUPPORTED) +/* + * Prepare for input from a supplied memory buffer. + * The buffer must contain the whole JPEG data. + */ + +GLOBAL(void) +jpeg_mem_src (j_decompress_ptr cinfo, + const unsigned char *inbuffer, unsigned long insize) +{ + struct jpeg_source_mgr *src; + + if (inbuffer == NULL || insize == 0) /* Treat empty input as fatal error */ + ERREXIT(cinfo, JERR_INPUT_EMPTY); + + /* The source object is made permanent so that a series of JPEG images + * can be read from the same buffer by calling jpeg_mem_src only before + * the first one. + */ + if (cinfo->src == NULL) { /* first time for this JPEG object? */ + cinfo->src = (struct jpeg_source_mgr *) + (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_PERMANENT, + sizeof(struct jpeg_source_mgr)); + } else if (cinfo->src->init_source != init_mem_source) { + /* It is unsafe to reuse the existing source manager unless it was created + * by this function. + */ + ERREXIT(cinfo, JERR_BUFFER_SIZE); + } + + src = cinfo->src; + src->init_source = init_mem_source; + src->fill_input_buffer = fill_mem_input_buffer; + src->skip_input_data = skip_input_data; + src->resync_to_restart = jpeg_resync_to_restart; /* use default method */ + src->term_source = term_source; + src->bytes_in_buffer = (size_t) insize; + src->next_input_byte = (const JOCTET *) inbuffer; +} +#endif diff --git a/media/libjpeg/jdcoefct.c b/media/libjpeg/jdcoefct.c new file mode 100644 index 000000000..1a48969b8 --- /dev/null +++ b/media/libjpeg/jdcoefct.c @@ -0,0 +1,693 @@ +/* + * jdcoefct.c + * + * This file was part of the Independent JPEG Group's software: + * Copyright (C) 1994-1997, Thomas G. Lane. + * libjpeg-turbo Modifications: + * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB + * Copyright (C) 2010, 2015-2016, D. R. Commander. + * Copyright (C) 2015, Google, Inc. + * For conditions of distribution and use, see the accompanying README.ijg + * file. + * + * This file contains the coefficient buffer controller for decompression. + * This controller is the top level of the JPEG decompressor proper. + * The coefficient buffer lies between entropy decoding and inverse-DCT steps. + * + * In buffered-image mode, this controller is the interface between + * input-oriented processing and output-oriented processing. + * Also, the input side (only) is used when reading a file for transcoding. + */ + +#include "jinclude.h" +#include "jdcoefct.h" +#include "jpegcomp.h" + + +/* Forward declarations */ +METHODDEF(int) decompress_onepass + (j_decompress_ptr cinfo, JSAMPIMAGE output_buf); +#ifdef D_MULTISCAN_FILES_SUPPORTED +METHODDEF(int) decompress_data + (j_decompress_ptr cinfo, JSAMPIMAGE output_buf); +#endif +#ifdef BLOCK_SMOOTHING_SUPPORTED +LOCAL(boolean) smoothing_ok (j_decompress_ptr cinfo); +METHODDEF(int) decompress_smooth_data + (j_decompress_ptr cinfo, JSAMPIMAGE output_buf); +#endif + + +/* + * Initialize for an input processing pass. + */ + +METHODDEF(void) +start_input_pass (j_decompress_ptr cinfo) +{ + cinfo->input_iMCU_row = 0; + start_iMCU_row(cinfo); +} + + +/* + * Initialize for an output processing pass. + */ + +METHODDEF(void) +start_output_pass (j_decompress_ptr cinfo) +{ +#ifdef BLOCK_SMOOTHING_SUPPORTED + my_coef_ptr coef = (my_coef_ptr) cinfo->coef; + + /* If multipass, check to see whether to use block smoothing on this pass */ + if (coef->pub.coef_arrays != NULL) { + if (cinfo->do_block_smoothing && smoothing_ok(cinfo)) + coef->pub.decompress_data = decompress_smooth_data; + else + coef->pub.decompress_data = decompress_data; + } +#endif + cinfo->output_iMCU_row = 0; +} + + +/* + * Decompress and return some data in the single-pass case. + * Always attempts to emit one fully interleaved MCU row ("iMCU" row). + * Input and output must run in lockstep since we have only a one-MCU buffer. + * Return value is JPEG_ROW_COMPLETED, JPEG_SCAN_COMPLETED, or JPEG_SUSPENDED. + * + * NB: output_buf contains a plane for each component in image, + * which we index according to the component's SOF position. + */ + +METHODDEF(int) +decompress_onepass (j_decompress_ptr cinfo, JSAMPIMAGE output_buf) +{ + my_coef_ptr coef = (my_coef_ptr) cinfo->coef; + JDIMENSION MCU_col_num; /* index of current MCU within row */ + JDIMENSION last_MCU_col = cinfo->MCUs_per_row - 1; + JDIMENSION last_iMCU_row = cinfo->total_iMCU_rows - 1; + int blkn, ci, xindex, yindex, yoffset, useful_width; + JSAMPARRAY output_ptr; + JDIMENSION start_col, output_col; + jpeg_component_info *compptr; + inverse_DCT_method_ptr inverse_DCT; + + /* Loop to process as much as one whole iMCU row */ + for (yoffset = coef->MCU_vert_offset; yoffset < coef->MCU_rows_per_iMCU_row; + yoffset++) { + for (MCU_col_num = coef->MCU_ctr; MCU_col_num <= last_MCU_col; + MCU_col_num++) { + /* Try to fetch an MCU. Entropy decoder expects buffer to be zeroed. */ + jzero_far((void *) coef->MCU_buffer[0], + (size_t) (cinfo->blocks_in_MCU * sizeof(JBLOCK))); + if (! (*cinfo->entropy->decode_mcu) (cinfo, coef->MCU_buffer)) { + /* Suspension forced; update state counters and exit */ + coef->MCU_vert_offset = yoffset; + coef->MCU_ctr = MCU_col_num; + return JPEG_SUSPENDED; + } + + /* Only perform the IDCT on blocks that are contained within the desired + * cropping region. + */ + if (MCU_col_num >= cinfo->master->first_iMCU_col && + MCU_col_num <= cinfo->master->last_iMCU_col) { + /* Determine where data should go in output_buf and do the IDCT thing. + * We skip dummy blocks at the right and bottom edges (but blkn gets + * incremented past them!). Note the inner loop relies on having + * allocated the MCU_buffer[] blocks sequentially. + */ + blkn = 0; /* index of current DCT block within MCU */ + for (ci = 0; ci < cinfo->comps_in_scan; ci++) { + compptr = cinfo->cur_comp_info[ci]; + /* Don't bother to IDCT an uninteresting component. */ + if (! compptr->component_needed) { + blkn += compptr->MCU_blocks; + continue; + } + inverse_DCT = cinfo->idct->inverse_DCT[compptr->component_index]; + useful_width = (MCU_col_num < last_MCU_col) ? compptr->MCU_width + : compptr->last_col_width; + output_ptr = output_buf[compptr->component_index] + + yoffset * compptr->_DCT_scaled_size; + start_col = (MCU_col_num - cinfo->master->first_iMCU_col) * + compptr->MCU_sample_width; + for (yindex = 0; yindex < compptr->MCU_height; yindex++) { + if (cinfo->input_iMCU_row < last_iMCU_row || + yoffset+yindex < compptr->last_row_height) { + output_col = start_col; + for (xindex = 0; xindex < useful_width; xindex++) { + (*inverse_DCT) (cinfo, compptr, + (JCOEFPTR) coef->MCU_buffer[blkn+xindex], + output_ptr, output_col); + output_col += compptr->_DCT_scaled_size; + } + } + blkn += compptr->MCU_width; + output_ptr += compptr->_DCT_scaled_size; + } + } + } + } + /* Completed an MCU row, but perhaps not an iMCU row */ + coef->MCU_ctr = 0; + } + /* Completed the iMCU row, advance counters for next one */ + cinfo->output_iMCU_row++; + if (++(cinfo->input_iMCU_row) < cinfo->total_iMCU_rows) { + start_iMCU_row(cinfo); + return JPEG_ROW_COMPLETED; + } + /* Completed the scan */ + (*cinfo->inputctl->finish_input_pass) (cinfo); + return JPEG_SCAN_COMPLETED; +} + + +/* + * Dummy consume-input routine for single-pass operation. + */ + +METHODDEF(int) +dummy_consume_data (j_decompress_ptr cinfo) +{ + return JPEG_SUSPENDED; /* Always indicate nothing was done */ +} + + +#ifdef D_MULTISCAN_FILES_SUPPORTED + +/* + * Consume input data and store it in the full-image coefficient buffer. + * We read as much as one fully interleaved MCU row ("iMCU" row) per call, + * ie, v_samp_factor block rows for each component in the scan. + * Return value is JPEG_ROW_COMPLETED, JPEG_SCAN_COMPLETED, or JPEG_SUSPENDED. + */ + +METHODDEF(int) +consume_data (j_decompress_ptr cinfo) +{ + my_coef_ptr coef = (my_coef_ptr) cinfo->coef; + JDIMENSION MCU_col_num; /* index of current MCU within row */ + int blkn, ci, xindex, yindex, yoffset; + JDIMENSION start_col; + JBLOCKARRAY buffer[MAX_COMPS_IN_SCAN]; + JBLOCKROW buffer_ptr; + jpeg_component_info *compptr; + + /* Align the virtual buffers for the components used in this scan. */ + for (ci = 0; ci < cinfo->comps_in_scan; ci++) { + compptr = cinfo->cur_comp_info[ci]; + buffer[ci] = (*cinfo->mem->access_virt_barray) + ((j_common_ptr) cinfo, coef->whole_image[compptr->component_index], + cinfo->input_iMCU_row * compptr->v_samp_factor, + (JDIMENSION) compptr->v_samp_factor, TRUE); + /* Note: entropy decoder expects buffer to be zeroed, + * but this is handled automatically by the memory manager + * because we requested a pre-zeroed array. + */ + } + + /* Loop to process one whole iMCU row */ + for (yoffset = coef->MCU_vert_offset; yoffset < coef->MCU_rows_per_iMCU_row; + yoffset++) { + for (MCU_col_num = coef->MCU_ctr; MCU_col_num < cinfo->MCUs_per_row; + MCU_col_num++) { + /* Construct list of pointers to DCT blocks belonging to this MCU */ + blkn = 0; /* index of current DCT block within MCU */ + for (ci = 0; ci < cinfo->comps_in_scan; ci++) { + compptr = cinfo->cur_comp_info[ci]; + start_col = MCU_col_num * compptr->MCU_width; + for (yindex = 0; yindex < compptr->MCU_height; yindex++) { + buffer_ptr = buffer[ci][yindex+yoffset] + start_col; + for (xindex = 0; xindex < compptr->MCU_width; xindex++) { + coef->MCU_buffer[blkn++] = buffer_ptr++; + } + } + } + /* Try to fetch the MCU. */ + if (! (*cinfo->entropy->decode_mcu) (cinfo, coef->MCU_buffer)) { + /* Suspension forced; update state counters and exit */ + coef->MCU_vert_offset = yoffset; + coef->MCU_ctr = MCU_col_num; + return JPEG_SUSPENDED; + } + } + /* Completed an MCU row, but perhaps not an iMCU row */ + coef->MCU_ctr = 0; + } + /* Completed the iMCU row, advance counters for next one */ + if (++(cinfo->input_iMCU_row) < cinfo->total_iMCU_rows) { + start_iMCU_row(cinfo); + return JPEG_ROW_COMPLETED; + } + /* Completed the scan */ + (*cinfo->inputctl->finish_input_pass) (cinfo); + return JPEG_SCAN_COMPLETED; +} + + +/* + * Decompress and return some data in the multi-pass case. + * Always attempts to emit one fully interleaved MCU row ("iMCU" row). + * Return value is JPEG_ROW_COMPLETED, JPEG_SCAN_COMPLETED, or JPEG_SUSPENDED. + * + * NB: output_buf contains a plane for each component in image. + */ + +METHODDEF(int) +decompress_data (j_decompress_ptr cinfo, JSAMPIMAGE output_buf) +{ + my_coef_ptr coef = (my_coef_ptr) cinfo->coef; + JDIMENSION last_iMCU_row = cinfo->total_iMCU_rows - 1; + JDIMENSION block_num; + int ci, block_row, block_rows; + JBLOCKARRAY buffer; + JBLOCKROW buffer_ptr; + JSAMPARRAY output_ptr; + JDIMENSION output_col; + jpeg_component_info *compptr; + inverse_DCT_method_ptr inverse_DCT; + + /* Force some input to be done if we are getting ahead of the input. */ + while (cinfo->input_scan_number < cinfo->output_scan_number || + (cinfo->input_scan_number == cinfo->output_scan_number && + cinfo->input_iMCU_row <= cinfo->output_iMCU_row)) { + if ((*cinfo->inputctl->consume_input)(cinfo) == JPEG_SUSPENDED) + return JPEG_SUSPENDED; + } + + /* OK, output from the virtual arrays. */ + for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components; + ci++, compptr++) { + /* Don't bother to IDCT an uninteresting component. */ + if (! compptr->component_needed) + continue; + /* Align the virtual buffer for this component. */ + buffer = (*cinfo->mem->access_virt_barray) + ((j_common_ptr) cinfo, coef->whole_image[ci], + cinfo->output_iMCU_row * compptr->v_samp_factor, + (JDIMENSION) compptr->v_samp_factor, FALSE); + /* Count non-dummy DCT block rows in this iMCU row. */ + if (cinfo->output_iMCU_row < last_iMCU_row) + block_rows = compptr->v_samp_factor; + else { + /* NB: can't use last_row_height here; it is input-side-dependent! */ + block_rows = (int) (compptr->height_in_blocks % compptr->v_samp_factor); + if (block_rows == 0) block_rows = compptr->v_samp_factor; + } + inverse_DCT = cinfo->idct->inverse_DCT[ci]; + output_ptr = output_buf[ci]; + /* Loop over all DCT blocks to be processed. */ + for (block_row = 0; block_row < block_rows; block_row++) { + buffer_ptr = buffer[block_row] + cinfo->master->first_MCU_col[ci]; + output_col = 0; + for (block_num = cinfo->master->first_MCU_col[ci]; + block_num <= cinfo->master->last_MCU_col[ci]; block_num++) { + (*inverse_DCT) (cinfo, compptr, (JCOEFPTR) buffer_ptr, + output_ptr, output_col); + buffer_ptr++; + output_col += compptr->_DCT_scaled_size; + } + output_ptr += compptr->_DCT_scaled_size; + } + } + + if (++(cinfo->output_iMCU_row) < cinfo->total_iMCU_rows) + return JPEG_ROW_COMPLETED; + return JPEG_SCAN_COMPLETED; +} + +#endif /* D_MULTISCAN_FILES_SUPPORTED */ + + +#ifdef BLOCK_SMOOTHING_SUPPORTED + +/* + * This code applies interblock smoothing as described by section K.8 + * of the JPEG standard: the first 5 AC coefficients are estimated from + * the DC values of a DCT block and its 8 neighboring blocks. + * We apply smoothing only for progressive JPEG decoding, and only if + * the coefficients it can estimate are not yet known to full precision. + */ + +/* Natural-order array positions of the first 5 zigzag-order coefficients */ +#define Q01_POS 1 +#define Q10_POS 8 +#define Q20_POS 16 +#define Q11_POS 9 +#define Q02_POS 2 + +/* + * Determine whether block smoothing is applicable and safe. + * We also latch the current states of the coef_bits[] entries for the + * AC coefficients; otherwise, if the input side of the decompressor + * advances into a new scan, we might think the coefficients are known + * more accurately than they really are. + */ + +LOCAL(boolean) +smoothing_ok (j_decompress_ptr cinfo) +{ + my_coef_ptr coef = (my_coef_ptr) cinfo->coef; + boolean smoothing_useful = FALSE; + int ci, coefi; + jpeg_component_info *compptr; + JQUANT_TBL *qtable; + int *coef_bits; + int *coef_bits_latch; + + if (! cinfo->progressive_mode || cinfo->coef_bits == NULL) + return FALSE; + + /* Allocate latch area if not already done */ + if (coef->coef_bits_latch == NULL) + coef->coef_bits_latch = (int *) + (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE, + cinfo->num_components * + (SAVED_COEFS * sizeof(int))); + coef_bits_latch = coef->coef_bits_latch; + + for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components; + ci++, compptr++) { + /* All components' quantization values must already be latched. */ + if ((qtable = compptr->quant_table) == NULL) + return FALSE; + /* Verify DC & first 5 AC quantizers are nonzero to avoid zero-divide. */ + if (qtable->quantval[0] == 0 || + qtable->quantval[Q01_POS] == 0 || + qtable->quantval[Q10_POS] == 0 || + qtable->quantval[Q20_POS] == 0 || + qtable->quantval[Q11_POS] == 0 || + qtable->quantval[Q02_POS] == 0) + return FALSE; + /* DC values must be at least partly known for all components. */ + coef_bits = cinfo->coef_bits[ci]; + if (coef_bits[0] < 0) + return FALSE; + /* Block smoothing is helpful if some AC coefficients remain inaccurate. */ + for (coefi = 1; coefi <= 5; coefi++) { + coef_bits_latch[coefi] = coef_bits[coefi]; + if (coef_bits[coefi] != 0) + smoothing_useful = TRUE; + } + coef_bits_latch += SAVED_COEFS; + } + + return smoothing_useful; +} + + +/* + * Variant of decompress_data for use when doing block smoothing. + */ + +METHODDEF(int) +decompress_smooth_data (j_decompress_ptr cinfo, JSAMPIMAGE output_buf) +{ + my_coef_ptr coef = (my_coef_ptr) cinfo->coef; + JDIMENSION last_iMCU_row = cinfo->total_iMCU_rows - 1; + JDIMENSION block_num, last_block_column; + int ci, block_row, block_rows, access_rows; + JBLOCKARRAY buffer; + JBLOCKROW buffer_ptr, prev_block_row, next_block_row; + JSAMPARRAY output_ptr; + JDIMENSION output_col; + jpeg_component_info *compptr; + inverse_DCT_method_ptr inverse_DCT; + boolean first_row, last_row; + JCOEF *workspace; + int *coef_bits; + JQUANT_TBL *quanttbl; + JLONG Q00,Q01,Q02,Q10,Q11,Q20, num; + int DC1,DC2,DC3,DC4,DC5,DC6,DC7,DC8,DC9; + int Al, pred; + + /* Keep a local variable to avoid looking it up more than once */ + workspace = coef->workspace; + + /* Force some input to be done if we are getting ahead of the input. */ + while (cinfo->input_scan_number <= cinfo->output_scan_number && + ! cinfo->inputctl->eoi_reached) { + if (cinfo->input_scan_number == cinfo->output_scan_number) { + /* If input is working on current scan, we ordinarily want it to + * have completed the current row. But if input scan is DC, + * we want it to keep one row ahead so that next block row's DC + * values are up to date. + */ + JDIMENSION delta = (cinfo->Ss == 0) ? 1 : 0; + if (cinfo->input_iMCU_row > cinfo->output_iMCU_row+delta) + break; + } + if ((*cinfo->inputctl->consume_input)(cinfo) == JPEG_SUSPENDED) + return JPEG_SUSPENDED; + } + + /* OK, output from the virtual arrays. */ + for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components; + ci++, compptr++) { + /* Don't bother to IDCT an uninteresting component. */ + if (! compptr->component_needed) + continue; + /* Count non-dummy DCT block rows in this iMCU row. */ + if (cinfo->output_iMCU_row < last_iMCU_row) { + block_rows = compptr->v_samp_factor; + access_rows = block_rows * 2; /* this and next iMCU row */ + last_row = FALSE; + } else { + /* NB: can't use last_row_height here; it is input-side-dependent! */ + block_rows = (int) (compptr->height_in_blocks % compptr->v_samp_factor); + if (block_rows == 0) block_rows = compptr->v_samp_factor; + access_rows = block_rows; /* this iMCU row only */ + last_row = TRUE; + } + /* Align the virtual buffer for this component. */ + if (cinfo->output_iMCU_row > 0) { + access_rows += compptr->v_samp_factor; /* prior iMCU row too */ + buffer = (*cinfo->mem->access_virt_barray) + ((j_common_ptr) cinfo, coef->whole_image[ci], + (cinfo->output_iMCU_row - 1) * compptr->v_samp_factor, + (JDIMENSION) access_rows, FALSE); + buffer += compptr->v_samp_factor; /* point to current iMCU row */ + first_row = FALSE; + } else { + buffer = (*cinfo->mem->access_virt_barray) + ((j_common_ptr) cinfo, coef->whole_image[ci], + (JDIMENSION) 0, (JDIMENSION) access_rows, FALSE); + first_row = TRUE; + } + /* Fetch component-dependent info */ + coef_bits = coef->coef_bits_latch + (ci * SAVED_COEFS); + quanttbl = compptr->quant_table; + Q00 = quanttbl->quantval[0]; + Q01 = quanttbl->quantval[Q01_POS]; + Q10 = quanttbl->quantval[Q10_POS]; + Q20 = quanttbl->quantval[Q20_POS]; + Q11 = quanttbl->quantval[Q11_POS]; + Q02 = quanttbl->quantval[Q02_POS]; + inverse_DCT = cinfo->idct->inverse_DCT[ci]; + output_ptr = output_buf[ci]; + /* Loop over all DCT blocks to be processed. */ + for (block_row = 0; block_row < block_rows; block_row++) { + buffer_ptr = buffer[block_row] + cinfo->master->first_MCU_col[ci]; + if (first_row && block_row == 0) + prev_block_row = buffer_ptr; + else + prev_block_row = buffer[block_row-1]; + if (last_row && block_row == block_rows-1) + next_block_row = buffer_ptr; + else + next_block_row = buffer[block_row+1]; + /* We fetch the surrounding DC values using a sliding-register approach. + * Initialize all nine here so as to do the right thing on narrow pics. + */ + DC1 = DC2 = DC3 = (int) prev_block_row[0][0]; + DC4 = DC5 = DC6 = (int) buffer_ptr[0][0]; + DC7 = DC8 = DC9 = (int) next_block_row[0][0]; + output_col = 0; + last_block_column = compptr->width_in_blocks - 1; + for (block_num = cinfo->master->first_MCU_col[ci]; + block_num <= cinfo->master->last_MCU_col[ci]; block_num++) { + /* Fetch current DCT block into workspace so we can modify it. */ + jcopy_block_row(buffer_ptr, (JBLOCKROW) workspace, (JDIMENSION) 1); + /* Update DC values */ + if (block_num < last_block_column) { + DC3 = (int) prev_block_row[1][0]; + DC6 = (int) buffer_ptr[1][0]; + DC9 = (int) next_block_row[1][0]; + } + /* Compute coefficient estimates per K.8. + * An estimate is applied only if coefficient is still zero, + * and is not known to be fully accurate. + */ + /* AC01 */ + if ((Al=coef_bits[1]) != 0 && workspace[1] == 0) { + num = 36 * Q00 * (DC4 - DC6); + if (num >= 0) { + pred = (int) (((Q01<<7) + num) / (Q01<<8)); + if (Al > 0 && pred >= (1<<Al)) + pred = (1<<Al)-1; + } else { + pred = (int) (((Q01<<7) - num) / (Q01<<8)); + if (Al > 0 && pred >= (1<<Al)) + pred = (1<<Al)-1; + pred = -pred; + } + workspace[1] = (JCOEF) pred; + } + /* AC10 */ + if ((Al=coef_bits[2]) != 0 && workspace[8] == 0) { + num = 36 * Q00 * (DC2 - DC8); + if (num >= 0) { + pred = (int) (((Q10<<7) + num) / (Q10<<8)); + if (Al > 0 && pred >= (1<<Al)) + pred = (1<<Al)-1; + } else { + pred = (int) (((Q10<<7) - num) / (Q10<<8)); + if (Al > 0 && pred >= (1<<Al)) + pred = (1<<Al)-1; + pred = -pred; + } + workspace[8] = (JCOEF) pred; + } + /* AC20 */ + if ((Al=coef_bits[3]) != 0 && workspace[16] == 0) { + num = 9 * Q00 * (DC2 + DC8 - 2*DC5); + if (num >= 0) { + pred = (int) (((Q20<<7) + num) / (Q20<<8)); + if (Al > 0 && pred >= (1<<Al)) + pred = (1<<Al)-1; + } else { + pred = (int) (((Q20<<7) - num) / (Q20<<8)); + if (Al > 0 && pred >= (1<<Al)) + pred = (1<<Al)-1; + pred = -pred; + } + workspace[16] = (JCOEF) pred; + } + /* AC11 */ + if ((Al=coef_bits[4]) != 0 && workspace[9] == 0) { + num = 5 * Q00 * (DC1 - DC3 - DC7 + DC9); + if (num >= 0) { + pred = (int) (((Q11<<7) + num) / (Q11<<8)); + if (Al > 0 && pred >= (1<<Al)) + pred = (1<<Al)-1; + } else { + pred = (int) (((Q11<<7) - num) / (Q11<<8)); + if (Al > 0 && pred >= (1<<Al)) + pred = (1<<Al)-1; + pred = -pred; + } + workspace[9] = (JCOEF) pred; + } + /* AC02 */ + if ((Al=coef_bits[5]) != 0 && workspace[2] == 0) { + num = 9 * Q00 * (DC4 + DC6 - 2*DC5); + if (num >= 0) { + pred = (int) (((Q02<<7) + num) / (Q02<<8)); + if (Al > 0 && pred >= (1<<Al)) + pred = (1<<Al)-1; + } else { + pred = (int) (((Q02<<7) - num) / (Q02<<8)); + if (Al > 0 && pred >= (1<<Al)) + pred = (1<<Al)-1; + pred = -pred; + } + workspace[2] = (JCOEF) pred; + } + /* OK, do the IDCT */ + (*inverse_DCT) (cinfo, compptr, (JCOEFPTR) workspace, + output_ptr, output_col); + /* Advance for next column */ + DC1 = DC2; DC2 = DC3; + DC4 = DC5; DC5 = DC6; + DC7 = DC8; DC8 = DC9; + buffer_ptr++, prev_block_row++, next_block_row++; + output_col += compptr->_DCT_scaled_size; + } + output_ptr += compptr->_DCT_scaled_size; + } + } + + if (++(cinfo->output_iMCU_row) < cinfo->total_iMCU_rows) + return JPEG_ROW_COMPLETED; + return JPEG_SCAN_COMPLETED; +} + +#endif /* BLOCK_SMOOTHING_SUPPORTED */ + + +/* + * Initialize coefficient buffer controller. + */ + +GLOBAL(void) +jinit_d_coef_controller (j_decompress_ptr cinfo, boolean need_full_buffer) +{ + my_coef_ptr coef; + + coef = (my_coef_ptr) + (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE, + sizeof(my_coef_controller)); + cinfo->coef = (struct jpeg_d_coef_controller *) coef; + coef->pub.start_input_pass = start_input_pass; + coef->pub.start_output_pass = start_output_pass; +#ifdef BLOCK_SMOOTHING_SUPPORTED + coef->coef_bits_latch = NULL; +#endif + + /* Create the coefficient buffer. */ + if (need_full_buffer) { +#ifdef D_MULTISCAN_FILES_SUPPORTED + /* Allocate a full-image virtual array for each component, */ + /* padded to a multiple of samp_factor DCT blocks in each direction. */ + /* Note we ask for a pre-zeroed array. */ + int ci, access_rows; + jpeg_component_info *compptr; + + for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components; + ci++, compptr++) { + access_rows = compptr->v_samp_factor; +#ifdef BLOCK_SMOOTHING_SUPPORTED + /* If block smoothing could be used, need a bigger window */ + if (cinfo->progressive_mode) + access_rows *= 3; +#endif + coef->whole_image[ci] = (*cinfo->mem->request_virt_barray) + ((j_common_ptr) cinfo, JPOOL_IMAGE, TRUE, + (JDIMENSION) jround_up((long) compptr->width_in_blocks, + (long) compptr->h_samp_factor), + (JDIMENSION) jround_up((long) compptr->height_in_blocks, + (long) compptr->v_samp_factor), + (JDIMENSION) access_rows); + } + coef->pub.consume_data = consume_data; + coef->pub.decompress_data = decompress_data; + coef->pub.coef_arrays = coef->whole_image; /* link to virtual arrays */ +#else + ERREXIT(cinfo, JERR_NOT_COMPILED); +#endif + } else { + /* We only need a single-MCU buffer. */ + JBLOCKROW buffer; + int i; + + buffer = (JBLOCKROW) + (*cinfo->mem->alloc_large) ((j_common_ptr) cinfo, JPOOL_IMAGE, + D_MAX_BLOCKS_IN_MCU * sizeof(JBLOCK)); + for (i = 0; i < D_MAX_BLOCKS_IN_MCU; i++) { + coef->MCU_buffer[i] = buffer + i; + } + coef->pub.consume_data = dummy_consume_data; + coef->pub.decompress_data = decompress_onepass; + coef->pub.coef_arrays = NULL; /* flag for no virtual arrays */ + } + + /* Allocate the workspace buffer */ + coef->workspace = (JCOEF *) + (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE, + sizeof(JCOEF) * DCTSIZE2); +} diff --git a/media/libjpeg/jdcoefct.h b/media/libjpeg/jdcoefct.h new file mode 100644 index 000000000..bf6beb274 --- /dev/null +++ b/media/libjpeg/jdcoefct.h @@ -0,0 +1,82 @@ +/* + * jdcoefct.h + * + * This file was part of the Independent JPEG Group's software: + * Copyright (C) 1994-1997, Thomas G. Lane. + * libjpeg-turbo Modifications: + * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB + * For conditions of distribution and use, see the accompanying README.ijg + * file. + */ + +#define JPEG_INTERNALS +#include "jpeglib.h" + + +/* Block smoothing is only applicable for progressive JPEG, so: */ +#ifndef D_PROGRESSIVE_SUPPORTED +#undef BLOCK_SMOOTHING_SUPPORTED +#endif + + +/* Private buffer controller object */ + +typedef struct { + struct jpeg_d_coef_controller pub; /* public fields */ + + /* These variables keep track of the current location of the input side. */ + /* cinfo->input_iMCU_row is also used for this. */ + JDIMENSION MCU_ctr; /* counts MCUs processed in current row */ + int MCU_vert_offset; /* counts MCU rows within iMCU row */ + int MCU_rows_per_iMCU_row; /* number of such rows needed */ + + /* The output side's location is represented by cinfo->output_iMCU_row. */ + + /* In single-pass modes, it's sufficient to buffer just one MCU. + * We allocate a workspace of D_MAX_BLOCKS_IN_MCU coefficient blocks, + * and let the entropy decoder write into that workspace each time. + * In multi-pass modes, this array points to the current MCU's blocks + * within the virtual arrays; it is used only by the input side. + */ + JBLOCKROW MCU_buffer[D_MAX_BLOCKS_IN_MCU]; + + /* Temporary workspace for one MCU */ + JCOEF *workspace; + +#ifdef D_MULTISCAN_FILES_SUPPORTED + /* In multi-pass modes, we need a virtual block array for each component. */ + jvirt_barray_ptr whole_image[MAX_COMPONENTS]; +#endif + +#ifdef BLOCK_SMOOTHING_SUPPORTED + /* When doing block smoothing, we latch coefficient Al values here */ + int *coef_bits_latch; +#define SAVED_COEFS 6 /* we save coef_bits[0..5] */ +#endif +} my_coef_controller; + +typedef my_coef_controller *my_coef_ptr; + + +LOCAL(void) +start_iMCU_row (j_decompress_ptr cinfo) +/* Reset within-iMCU-row counters for a new row (input side) */ +{ + my_coef_ptr coef = (my_coef_ptr) cinfo->coef; + + /* In an interleaved scan, an MCU row is the same as an iMCU row. + * In a noninterleaved scan, an iMCU row has v_samp_factor MCU rows. + * But at the bottom of the image, process only what's left. + */ + if (cinfo->comps_in_scan > 1) { + coef->MCU_rows_per_iMCU_row = 1; + } else { + if (cinfo->input_iMCU_row < (cinfo->total_iMCU_rows-1)) + coef->MCU_rows_per_iMCU_row = cinfo->cur_comp_info[0]->v_samp_factor; + else + coef->MCU_rows_per_iMCU_row = cinfo->cur_comp_info[0]->last_row_height; + } + + coef->MCU_ctr = 0; + coef->MCU_vert_offset = 0; +} diff --git a/media/libjpeg/jdcol565.c b/media/libjpeg/jdcol565.c new file mode 100644 index 000000000..349fce4a6 --- /dev/null +++ b/media/libjpeg/jdcol565.c @@ -0,0 +1,384 @@ +/* + * jdcol565.c + * + * This file was part of the Independent JPEG Group's software: + * Copyright (C) 1991-1997, Thomas G. Lane. + * Modifications: + * Copyright (C) 2013, Linaro Limited. + * Copyright (C) 2014-2015, D. R. Commander. + * For conditions of distribution and use, see the accompanying README.ijg + * file. + * + * This file contains output colorspace conversion routines. + */ + +/* This file is included by jdcolor.c */ + + +INLINE +LOCAL(void) +ycc_rgb565_convert_internal (j_decompress_ptr cinfo, + JSAMPIMAGE input_buf, JDIMENSION input_row, + JSAMPARRAY output_buf, int num_rows) +{ + my_cconvert_ptr cconvert = (my_cconvert_ptr) cinfo->cconvert; + register int y, cb, cr; + register JSAMPROW outptr; + register JSAMPROW inptr0, inptr1, inptr2; + register JDIMENSION col; + JDIMENSION num_cols = cinfo->output_width; + /* copy these pointers into registers if possible */ + register JSAMPLE * range_limit = cinfo->sample_range_limit; + register int * Crrtab = cconvert->Cr_r_tab; + register int * Cbbtab = cconvert->Cb_b_tab; + register JLONG * Crgtab = cconvert->Cr_g_tab; + register JLONG * Cbgtab = cconvert->Cb_g_tab; + SHIFT_TEMPS + + while (--num_rows >= 0) { + JLONG rgb; + unsigned int r, g, b; + inptr0 = input_buf[0][input_row]; + inptr1 = input_buf[1][input_row]; + inptr2 = input_buf[2][input_row]; + input_row++; + outptr = *output_buf++; + + if (PACK_NEED_ALIGNMENT(outptr)) { + y = GETJSAMPLE(*inptr0++); + cb = GETJSAMPLE(*inptr1++); + cr = GETJSAMPLE(*inptr2++); + r = range_limit[y + Crrtab[cr]]; + g = range_limit[y + ((int)RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr], + SCALEBITS))]; + b = range_limit[y + Cbbtab[cb]]; + rgb = PACK_SHORT_565(r, g, b); + *(INT16*)outptr = (INT16)rgb; + outptr += 2; + num_cols--; + } + for (col = 0; col < (num_cols >> 1); col++) { + y = GETJSAMPLE(*inptr0++); + cb = GETJSAMPLE(*inptr1++); + cr = GETJSAMPLE(*inptr2++); + r = range_limit[y + Crrtab[cr]]; + g = range_limit[y + ((int)RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr], + SCALEBITS))]; + b = range_limit[y + Cbbtab[cb]]; + rgb = PACK_SHORT_565(r, g, b); + + y = GETJSAMPLE(*inptr0++); + cb = GETJSAMPLE(*inptr1++); + cr = GETJSAMPLE(*inptr2++); + r = range_limit[y + Crrtab[cr]]; + g = range_limit[y + ((int)RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr], + SCALEBITS))]; + b = range_limit[y + Cbbtab[cb]]; + rgb = PACK_TWO_PIXELS(rgb, PACK_SHORT_565(r, g, b)); + + WRITE_TWO_ALIGNED_PIXELS(outptr, rgb); + outptr += 4; + } + if (num_cols & 1) { + y = GETJSAMPLE(*inptr0); + cb = GETJSAMPLE(*inptr1); + cr = GETJSAMPLE(*inptr2); + r = range_limit[y + Crrtab[cr]]; + g = range_limit[y + ((int)RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr], + SCALEBITS))]; + b = range_limit[y + Cbbtab[cb]]; + rgb = PACK_SHORT_565(r, g, b); + *(INT16*)outptr = (INT16)rgb; + } + } +} + + +INLINE +LOCAL(void) +ycc_rgb565D_convert_internal (j_decompress_ptr cinfo, + JSAMPIMAGE input_buf, JDIMENSION input_row, + JSAMPARRAY output_buf, int num_rows) +{ + my_cconvert_ptr cconvert = (my_cconvert_ptr) cinfo->cconvert; + register int y, cb, cr; + register JSAMPROW outptr; + register JSAMPROW inptr0, inptr1, inptr2; + register JDIMENSION col; + JDIMENSION num_cols = cinfo->output_width; + /* copy these pointers into registers if possible */ + register JSAMPLE * range_limit = cinfo->sample_range_limit; + register int * Crrtab = cconvert->Cr_r_tab; + register int * Cbbtab = cconvert->Cb_b_tab; + register JLONG * Crgtab = cconvert->Cr_g_tab; + register JLONG * Cbgtab = cconvert->Cb_g_tab; + JLONG d0 = dither_matrix[cinfo->output_scanline & DITHER_MASK]; + SHIFT_TEMPS + + while (--num_rows >= 0) { + JLONG rgb; + unsigned int r, g, b; + + inptr0 = input_buf[0][input_row]; + inptr1 = input_buf[1][input_row]; + inptr2 = input_buf[2][input_row]; + input_row++; + outptr = *output_buf++; + if (PACK_NEED_ALIGNMENT(outptr)) { + y = GETJSAMPLE(*inptr0++); + cb = GETJSAMPLE(*inptr1++); + cr = GETJSAMPLE(*inptr2++); + r = range_limit[DITHER_565_R(y + Crrtab[cr], d0)]; + g = range_limit[DITHER_565_G(y + + ((int)RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr], + SCALEBITS)), d0)]; + b = range_limit[DITHER_565_B(y + Cbbtab[cb], d0)]; + rgb = PACK_SHORT_565(r, g, b); + *(INT16*)outptr = (INT16)rgb; + outptr += 2; + num_cols--; + } + for (col = 0; col < (num_cols >> 1); col++) { + y = GETJSAMPLE(*inptr0++); + cb = GETJSAMPLE(*inptr1++); + cr = GETJSAMPLE(*inptr2++); + r = range_limit[DITHER_565_R(y + Crrtab[cr], d0)]; + g = range_limit[DITHER_565_G(y + + ((int)RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr], + SCALEBITS)), d0)]; + b = range_limit[DITHER_565_B(y + Cbbtab[cb], d0)]; + d0 = DITHER_ROTATE(d0); + rgb = PACK_SHORT_565(r, g, b); + + y = GETJSAMPLE(*inptr0++); + cb = GETJSAMPLE(*inptr1++); + cr = GETJSAMPLE(*inptr2++); + r = range_limit[DITHER_565_R(y + Crrtab[cr], d0)]; + g = range_limit[DITHER_565_G(y + + ((int)RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr], + SCALEBITS)), d0)]; + b = range_limit[DITHER_565_B(y + Cbbtab[cb], d0)]; + d0 = DITHER_ROTATE(d0); + rgb = PACK_TWO_PIXELS(rgb, PACK_SHORT_565(r, g, b)); + + WRITE_TWO_ALIGNED_PIXELS(outptr, rgb); + outptr += 4; + } + if (num_cols & 1) { + y = GETJSAMPLE(*inptr0); + cb = GETJSAMPLE(*inptr1); + cr = GETJSAMPLE(*inptr2); + r = range_limit[DITHER_565_R(y + Crrtab[cr], d0)]; + g = range_limit[DITHER_565_G(y + + ((int)RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr], + SCALEBITS)), d0)]; + b = range_limit[DITHER_565_B(y + Cbbtab[cb], d0)]; + rgb = PACK_SHORT_565(r, g, b); + *(INT16*)outptr = (INT16)rgb; + } + } +} + + +INLINE +LOCAL(void) +rgb_rgb565_convert_internal (j_decompress_ptr cinfo, + JSAMPIMAGE input_buf, JDIMENSION input_row, + JSAMPARRAY output_buf, int num_rows) +{ + register JSAMPROW outptr; + register JSAMPROW inptr0, inptr1, inptr2; + register JDIMENSION col; + JDIMENSION num_cols = cinfo->output_width; + SHIFT_TEMPS + + while (--num_rows >= 0) { + JLONG rgb; + unsigned int r, g, b; + + inptr0 = input_buf[0][input_row]; + inptr1 = input_buf[1][input_row]; + inptr2 = input_buf[2][input_row]; + input_row++; + outptr = *output_buf++; + if (PACK_NEED_ALIGNMENT(outptr)) { + r = GETJSAMPLE(*inptr0++); + g = GETJSAMPLE(*inptr1++); + b = GETJSAMPLE(*inptr2++); + rgb = PACK_SHORT_565(r, g, b); + *(INT16*)outptr = (INT16)rgb; + outptr += 2; + num_cols--; + } + for (col = 0; col < (num_cols >> 1); col++) { + r = GETJSAMPLE(*inptr0++); + g = GETJSAMPLE(*inptr1++); + b = GETJSAMPLE(*inptr2++); + rgb = PACK_SHORT_565(r, g, b); + + r = GETJSAMPLE(*inptr0++); + g = GETJSAMPLE(*inptr1++); + b = GETJSAMPLE(*inptr2++); + rgb = PACK_TWO_PIXELS(rgb, PACK_SHORT_565(r, g, b)); + + WRITE_TWO_ALIGNED_PIXELS(outptr, rgb); + outptr += 4; + } + if (num_cols & 1) { + r = GETJSAMPLE(*inptr0); + g = GETJSAMPLE(*inptr1); + b = GETJSAMPLE(*inptr2); + rgb = PACK_SHORT_565(r, g, b); + *(INT16*)outptr = (INT16)rgb; + } + } +} + + +INLINE +LOCAL(void) +rgb_rgb565D_convert_internal (j_decompress_ptr cinfo, + JSAMPIMAGE input_buf, JDIMENSION input_row, + JSAMPARRAY output_buf, int num_rows) +{ + register JSAMPROW outptr; + register JSAMPROW inptr0, inptr1, inptr2; + register JDIMENSION col; + register JSAMPLE * range_limit = cinfo->sample_range_limit; + JDIMENSION num_cols = cinfo->output_width; + JLONG d0 = dither_matrix[cinfo->output_scanline & DITHER_MASK]; + SHIFT_TEMPS + + while (--num_rows >= 0) { + JLONG rgb; + unsigned int r, g, b; + + inptr0 = input_buf[0][input_row]; + inptr1 = input_buf[1][input_row]; + inptr2 = input_buf[2][input_row]; + input_row++; + outptr = *output_buf++; + if (PACK_NEED_ALIGNMENT(outptr)) { + r = range_limit[DITHER_565_R(GETJSAMPLE(*inptr0++), d0)]; + g = range_limit[DITHER_565_G(GETJSAMPLE(*inptr1++), d0)]; + b = range_limit[DITHER_565_B(GETJSAMPLE(*inptr2++), d0)]; + rgb = PACK_SHORT_565(r, g, b); + *(INT16*)outptr = (INT16)rgb; + outptr += 2; + num_cols--; + } + for (col = 0; col < (num_cols >> 1); col++) { + r = range_limit[DITHER_565_R(GETJSAMPLE(*inptr0++), d0)]; + g = range_limit[DITHER_565_G(GETJSAMPLE(*inptr1++), d0)]; + b = range_limit[DITHER_565_B(GETJSAMPLE(*inptr2++), d0)]; + d0 = DITHER_ROTATE(d0); + rgb = PACK_SHORT_565(r, g, b); + + r = range_limit[DITHER_565_R(GETJSAMPLE(*inptr0++), d0)]; + g = range_limit[DITHER_565_G(GETJSAMPLE(*inptr1++), d0)]; + b = range_limit[DITHER_565_B(GETJSAMPLE(*inptr2++), d0)]; + d0 = DITHER_ROTATE(d0); + rgb = PACK_TWO_PIXELS(rgb, PACK_SHORT_565(r, g, b)); + + WRITE_TWO_ALIGNED_PIXELS(outptr, rgb); + outptr += 4; + } + if (num_cols & 1) { + r = range_limit[DITHER_565_R(GETJSAMPLE(*inptr0), d0)]; + g = range_limit[DITHER_565_G(GETJSAMPLE(*inptr1), d0)]; + b = range_limit[DITHER_565_B(GETJSAMPLE(*inptr2), d0)]; + rgb = PACK_SHORT_565(r, g, b); + *(INT16*)outptr = (INT16)rgb; + } + } +} + + +INLINE +LOCAL(void) +gray_rgb565_convert_internal (j_decompress_ptr cinfo, + JSAMPIMAGE input_buf, JDIMENSION input_row, + JSAMPARRAY output_buf, int num_rows) +{ + register JSAMPROW inptr, outptr; + register JDIMENSION col; + JDIMENSION num_cols = cinfo->output_width; + + while (--num_rows >= 0) { + JLONG rgb; + unsigned int g; + + inptr = input_buf[0][input_row++]; + outptr = *output_buf++; + if (PACK_NEED_ALIGNMENT(outptr)) { + g = *inptr++; + rgb = PACK_SHORT_565(g, g, g); + *(INT16*)outptr = (INT16)rgb; + outptr += 2; + num_cols--; + } + for (col = 0; col < (num_cols >> 1); col++) { + g = *inptr++; + rgb = PACK_SHORT_565(g, g, g); + g = *inptr++; + rgb = PACK_TWO_PIXELS(rgb, PACK_SHORT_565(g, g, g)); + WRITE_TWO_ALIGNED_PIXELS(outptr, rgb); + outptr += 4; + } + if (num_cols & 1) { + g = *inptr; + rgb = PACK_SHORT_565(g, g, g); + *(INT16*)outptr = (INT16)rgb; + } + } +} + + +INLINE +LOCAL(void) +gray_rgb565D_convert_internal (j_decompress_ptr cinfo, + JSAMPIMAGE input_buf, JDIMENSION input_row, + JSAMPARRAY output_buf, int num_rows) +{ + register JSAMPROW inptr, outptr; + register JDIMENSION col; + register JSAMPLE * range_limit = cinfo->sample_range_limit; + JDIMENSION num_cols = cinfo->output_width; + JLONG d0 = dither_matrix[cinfo->output_scanline & DITHER_MASK]; + + while (--num_rows >= 0) { + JLONG rgb; + unsigned int g; + + inptr = input_buf[0][input_row++]; + outptr = *output_buf++; + if (PACK_NEED_ALIGNMENT(outptr)) { + g = *inptr++; + g = range_limit[DITHER_565_R(g, d0)]; + rgb = PACK_SHORT_565(g, g, g); + *(INT16*)outptr = (INT16)rgb; + outptr += 2; + num_cols--; + } + for (col = 0; col < (num_cols >> 1); col++) { + g = *inptr++; + g = range_limit[DITHER_565_R(g, d0)]; + rgb = PACK_SHORT_565(g, g, g); + d0 = DITHER_ROTATE(d0); + + g = *inptr++; + g = range_limit[DITHER_565_R(g, d0)]; + rgb = PACK_TWO_PIXELS(rgb, PACK_SHORT_565(g, g, g)); + d0 = DITHER_ROTATE(d0); + + WRITE_TWO_ALIGNED_PIXELS(outptr, rgb); + outptr += 4; + } + if (num_cols & 1) { + g = *inptr; + g = range_limit[DITHER_565_R(g, d0)]; + rgb = PACK_SHORT_565(g, g, g); + *(INT16*)outptr = (INT16)rgb; + } + } +} diff --git a/media/libjpeg/jdcolext.c b/media/libjpeg/jdcolext.c new file mode 100644 index 000000000..59b676cc4 --- /dev/null +++ b/media/libjpeg/jdcolext.c @@ -0,0 +1,143 @@ +/* + * jdcolext.c + * + * This file was part of the Independent JPEG Group's software: + * Copyright (C) 1991-1997, Thomas G. Lane. + * libjpeg-turbo Modifications: + * Copyright (C) 2009, 2011, 2015, D. R. Commander. + * For conditions of distribution and use, see the accompanying README.ijg + * file. + * + * This file contains output colorspace conversion routines. + */ + + +/* This file is included by jdcolor.c */ + + +/* + * Convert some rows of samples to the output colorspace. + * + * Note that we change from noninterleaved, one-plane-per-component format + * to interleaved-pixel format. The output buffer is therefore three times + * as wide as the input buffer. + * A starting row offset is provided only for the input buffer. The caller + * can easily adjust the passed output_buf value to accommodate any row + * offset required on that side. + */ + +INLINE +LOCAL(void) +ycc_rgb_convert_internal (j_decompress_ptr cinfo, + JSAMPIMAGE input_buf, JDIMENSION input_row, + JSAMPARRAY output_buf, int num_rows) +{ + my_cconvert_ptr cconvert = (my_cconvert_ptr) cinfo->cconvert; + register int y, cb, cr; + register JSAMPROW outptr; + register JSAMPROW inptr0, inptr1, inptr2; + register JDIMENSION col; + JDIMENSION num_cols = cinfo->output_width; + /* copy these pointers into registers if possible */ + register JSAMPLE * range_limit = cinfo->sample_range_limit; + register int * Crrtab = cconvert->Cr_r_tab; + register int * Cbbtab = cconvert->Cb_b_tab; + register JLONG * Crgtab = cconvert->Cr_g_tab; + register JLONG * Cbgtab = cconvert->Cb_g_tab; + SHIFT_TEMPS + + while (--num_rows >= 0) { + inptr0 = input_buf[0][input_row]; + inptr1 = input_buf[1][input_row]; + inptr2 = input_buf[2][input_row]; + input_row++; + outptr = *output_buf++; + for (col = 0; col < num_cols; col++) { + y = GETJSAMPLE(inptr0[col]); + cb = GETJSAMPLE(inptr1[col]); + cr = GETJSAMPLE(inptr2[col]); + /* Range-limiting is essential due to noise introduced by DCT losses. */ + outptr[RGB_RED] = range_limit[y + Crrtab[cr]]; + outptr[RGB_GREEN] = range_limit[y + + ((int) RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr], + SCALEBITS))]; + outptr[RGB_BLUE] = range_limit[y + Cbbtab[cb]]; + /* Set unused byte to 0xFF so it can be interpreted as an opaque */ + /* alpha channel value */ +#ifdef RGB_ALPHA + outptr[RGB_ALPHA] = 0xFF; +#endif + outptr += RGB_PIXELSIZE; + } + } +} + + +/* + * Convert grayscale to RGB: just duplicate the graylevel three times. + * This is provided to support applications that don't want to cope + * with grayscale as a separate case. + */ + +INLINE +LOCAL(void) +gray_rgb_convert_internal (j_decompress_ptr cinfo, + JSAMPIMAGE input_buf, JDIMENSION input_row, + JSAMPARRAY output_buf, int num_rows) +{ + register JSAMPROW inptr, outptr; + register JDIMENSION col; + JDIMENSION num_cols = cinfo->output_width; + + while (--num_rows >= 0) { + inptr = input_buf[0][input_row++]; + outptr = *output_buf++; + for (col = 0; col < num_cols; col++) { + /* We can dispense with GETJSAMPLE() here */ + outptr[RGB_RED] = outptr[RGB_GREEN] = outptr[RGB_BLUE] = inptr[col]; + /* Set unused byte to 0xFF so it can be interpreted as an opaque */ + /* alpha channel value */ +#ifdef RGB_ALPHA + outptr[RGB_ALPHA] = 0xFF; +#endif + outptr += RGB_PIXELSIZE; + } + } +} + + +/* + * Convert RGB to extended RGB: just swap the order of source pixels + */ + +INLINE +LOCAL(void) +rgb_rgb_convert_internal (j_decompress_ptr cinfo, + JSAMPIMAGE input_buf, JDIMENSION input_row, + JSAMPARRAY output_buf, int num_rows) +{ + register JSAMPROW inptr0, inptr1, inptr2; + register JSAMPROW outptr; + register JDIMENSION col; + JDIMENSION num_cols = cinfo->output_width; + + while (--num_rows >= 0) { + inptr0 = input_buf[0][input_row]; + inptr1 = input_buf[1][input_row]; + inptr2 = input_buf[2][input_row]; + input_row++; + outptr = *output_buf++; + for (col = 0; col < num_cols; col++) { + /* We can dispense with GETJSAMPLE() here */ + outptr[RGB_RED] = inptr0[col]; + outptr[RGB_GREEN] = inptr1[col]; + outptr[RGB_BLUE] = inptr2[col]; + /* Set unused byte to 0xFF so it can be interpreted as an opaque */ + /* alpha channel value */ +#ifdef RGB_ALPHA + outptr[RGB_ALPHA] = 0xFF; +#endif + outptr += RGB_PIXELSIZE; + } + } +} diff --git a/media/libjpeg/jdcolor.c b/media/libjpeg/jdcolor.c new file mode 100644 index 000000000..ab8fa2492 --- /dev/null +++ b/media/libjpeg/jdcolor.c @@ -0,0 +1,897 @@ +/* + * jdcolor.c + * + * This file was part of the Independent JPEG Group's software: + * Copyright (C) 1991-1997, Thomas G. Lane. + * Modified 2011 by Guido Vollbeding. + * libjpeg-turbo Modifications: + * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB + * Copyright (C) 2009, 2011-2012, 2014-2015, D. R. Commander. + * Copyright (C) 2013, Linaro Limited. + * For conditions of distribution and use, see the accompanying README.ijg + * file. + * + * This file contains output colorspace conversion routines. + */ + +#define JPEG_INTERNALS +#include "jinclude.h" +#include "jpeglib.h" +#include "jsimd.h" +#include "jconfigint.h" + + +/* Private subobject */ + +typedef struct { + struct jpeg_color_deconverter pub; /* public fields */ + + /* Private state for YCC->RGB conversion */ + int *Cr_r_tab; /* => table for Cr to R conversion */ + int *Cb_b_tab; /* => table for Cb to B conversion */ + JLONG *Cr_g_tab; /* => table for Cr to G conversion */ + JLONG *Cb_g_tab; /* => table for Cb to G conversion */ + + /* Private state for RGB->Y conversion */ + JLONG *rgb_y_tab; /* => table for RGB to Y conversion */ +} my_color_deconverter; + +typedef my_color_deconverter *my_cconvert_ptr; + + +/**************** YCbCr -> RGB conversion: most common case **************/ +/**************** RGB -> Y conversion: less common case **************/ + +/* + * YCbCr is defined per CCIR 601-1, except that Cb and Cr are + * normalized to the range 0..MAXJSAMPLE rather than -0.5 .. 0.5. + * The conversion equations to be implemented are therefore + * + * R = Y + 1.40200 * Cr + * G = Y - 0.34414 * Cb - 0.71414 * Cr + * B = Y + 1.77200 * Cb + * + * Y = 0.29900 * R + 0.58700 * G + 0.11400 * B + * + * where Cb and Cr represent the incoming values less CENTERJSAMPLE. + * (These numbers are derived from TIFF 6.0 section 21, dated 3-June-92.) + * + * To avoid floating-point arithmetic, we represent the fractional constants + * as integers scaled up by 2^16 (about 4 digits precision); we have to divide + * the products by 2^16, with appropriate rounding, to get the correct answer. + * Notice that Y, being an integral input, does not contribute any fraction + * so it need not participate in the rounding. + * + * For even more speed, we avoid doing any multiplications in the inner loop + * by precalculating the constants times Cb and Cr for all possible values. + * For 8-bit JSAMPLEs this is very reasonable (only 256 entries per table); + * for 12-bit samples it is still acceptable. It's not very reasonable for + * 16-bit samples, but if you want lossless storage you shouldn't be changing + * colorspace anyway. + * The Cr=>R and Cb=>B values can be rounded to integers in advance; the + * values for the G calculation are left scaled up, since we must add them + * together before rounding. + */ + +#define SCALEBITS 16 /* speediest right-shift on some machines */ +#define ONE_HALF ((JLONG) 1 << (SCALEBITS-1)) +#define FIX(x) ((JLONG) ((x) * (1L<<SCALEBITS) + 0.5)) + +/* We allocate one big table for RGB->Y conversion and divide it up into + * three parts, instead of doing three alloc_small requests. This lets us + * use a single table base address, which can be held in a register in the + * inner loops on many machines (more than can hold all three addresses, + * anyway). + */ + +#define R_Y_OFF 0 /* offset to R => Y section */ +#define G_Y_OFF (1*(MAXJSAMPLE+1)) /* offset to G => Y section */ +#define B_Y_OFF (2*(MAXJSAMPLE+1)) /* etc. */ +#define TABLE_SIZE (3*(MAXJSAMPLE+1)) + + +/* Include inline routines for colorspace extensions */ + +#include "jdcolext.c" +#undef RGB_RED +#undef RGB_GREEN +#undef RGB_BLUE +#undef RGB_PIXELSIZE + +#define RGB_RED EXT_RGB_RED +#define RGB_GREEN EXT_RGB_GREEN +#define RGB_BLUE EXT_RGB_BLUE +#define RGB_PIXELSIZE EXT_RGB_PIXELSIZE +#define ycc_rgb_convert_internal ycc_extrgb_convert_internal +#define gray_rgb_convert_internal gray_extrgb_convert_internal +#define rgb_rgb_convert_internal rgb_extrgb_convert_internal +#include "jdcolext.c" +#undef RGB_RED +#undef RGB_GREEN +#undef RGB_BLUE +#undef RGB_PIXELSIZE +#undef ycc_rgb_convert_internal +#undef gray_rgb_convert_internal +#undef rgb_rgb_convert_internal + +#define RGB_RED EXT_RGBX_RED +#define RGB_GREEN EXT_RGBX_GREEN +#define RGB_BLUE EXT_RGBX_BLUE +#define RGB_ALPHA 3 +#define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE +#define ycc_rgb_convert_internal ycc_extrgbx_convert_internal +#define gray_rgb_convert_internal gray_extrgbx_convert_internal +#define rgb_rgb_convert_internal rgb_extrgbx_convert_internal +#include "jdcolext.c" +#undef RGB_RED +#undef RGB_GREEN +#undef RGB_BLUE +#undef RGB_ALPHA +#undef RGB_PIXELSIZE +#undef ycc_rgb_convert_internal +#undef gray_rgb_convert_internal +#undef rgb_rgb_convert_internal + +#define RGB_RED EXT_BGR_RED +#define RGB_GREEN EXT_BGR_GREEN +#define RGB_BLUE EXT_BGR_BLUE +#define RGB_PIXELSIZE EXT_BGR_PIXELSIZE +#define ycc_rgb_convert_internal ycc_extbgr_convert_internal +#define gray_rgb_convert_internal gray_extbgr_convert_internal +#define rgb_rgb_convert_internal rgb_extbgr_convert_internal +#include "jdcolext.c" +#undef RGB_RED +#undef RGB_GREEN +#undef RGB_BLUE +#undef RGB_PIXELSIZE +#undef ycc_rgb_convert_internal +#undef gray_rgb_convert_internal +#undef rgb_rgb_convert_internal + +#define RGB_RED EXT_BGRX_RED +#define RGB_GREEN EXT_BGRX_GREEN +#define RGB_BLUE EXT_BGRX_BLUE +#define RGB_ALPHA 3 +#define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE +#define ycc_rgb_convert_internal ycc_extbgrx_convert_internal +#define gray_rgb_convert_internal gray_extbgrx_convert_internal +#define rgb_rgb_convert_internal rgb_extbgrx_convert_internal +#include "jdcolext.c" +#undef RGB_RED +#undef RGB_GREEN +#undef RGB_BLUE +#undef RGB_ALPHA +#undef RGB_PIXELSIZE +#undef ycc_rgb_convert_internal +#undef gray_rgb_convert_internal +#undef rgb_rgb_convert_internal + +#define RGB_RED EXT_XBGR_RED +#define RGB_GREEN EXT_XBGR_GREEN +#define RGB_BLUE EXT_XBGR_BLUE +#define RGB_ALPHA 0 +#define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE +#define ycc_rgb_convert_internal ycc_extxbgr_convert_internal +#define gray_rgb_convert_internal gray_extxbgr_convert_internal +#define rgb_rgb_convert_internal rgb_extxbgr_convert_internal +#include "jdcolext.c" +#undef RGB_RED +#undef RGB_GREEN +#undef RGB_BLUE +#undef RGB_ALPHA +#undef RGB_PIXELSIZE +#undef ycc_rgb_convert_internal +#undef gray_rgb_convert_internal +#undef rgb_rgb_convert_internal + +#define RGB_RED EXT_XRGB_RED +#define RGB_GREEN EXT_XRGB_GREEN +#define RGB_BLUE EXT_XRGB_BLUE +#define RGB_ALPHA 0 +#define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE +#define ycc_rgb_convert_internal ycc_extxrgb_convert_internal +#define gray_rgb_convert_internal gray_extxrgb_convert_internal +#define rgb_rgb_convert_internal rgb_extxrgb_convert_internal +#include "jdcolext.c" +#undef RGB_RED +#undef RGB_GREEN +#undef RGB_BLUE +#undef RGB_ALPHA +#undef RGB_PIXELSIZE +#undef ycc_rgb_convert_internal +#undef gray_rgb_convert_internal +#undef rgb_rgb_convert_internal + + +/* + * Initialize tables for YCC->RGB colorspace conversion. + */ + +LOCAL(void) +build_ycc_rgb_table (j_decompress_ptr cinfo) +{ + my_cconvert_ptr cconvert = (my_cconvert_ptr) cinfo->cconvert; + int i; + JLONG x; + SHIFT_TEMPS + + cconvert->Cr_r_tab = (int *) + (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE, + (MAXJSAMPLE+1) * sizeof(int)); + cconvert->Cb_b_tab = (int *) + (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE, + (MAXJSAMPLE+1) * sizeof(int)); + cconvert->Cr_g_tab = (JLONG *) + (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE, + (MAXJSAMPLE+1) * sizeof(JLONG)); + cconvert->Cb_g_tab = (JLONG *) + (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE, + (MAXJSAMPLE+1) * sizeof(JLONG)); + + for (i = 0, x = -CENTERJSAMPLE; i <= MAXJSAMPLE; i++, x++) { + /* i is the actual input pixel value, in the range 0..MAXJSAMPLE */ + /* The Cb or Cr value we are thinking of is x = i - CENTERJSAMPLE */ + /* Cr=>R value is nearest int to 1.40200 * x */ + cconvert->Cr_r_tab[i] = (int) + RIGHT_SHIFT(FIX(1.40200) * x + ONE_HALF, SCALEBITS); + /* Cb=>B value is nearest int to 1.77200 * x */ + cconvert->Cb_b_tab[i] = (int) + RIGHT_SHIFT(FIX(1.77200) * x + ONE_HALF, SCALEBITS); + /* Cr=>G value is scaled-up -0.71414 * x */ + cconvert->Cr_g_tab[i] = (- FIX(0.71414)) * x; + /* Cb=>G value is scaled-up -0.34414 * x */ + /* We also add in ONE_HALF so that need not do it in inner loop */ + cconvert->Cb_g_tab[i] = (- FIX(0.34414)) * x + ONE_HALF; + } +} + + +/* + * Convert some rows of samples to the output colorspace. + */ + +METHODDEF(void) +ycc_rgb_convert (j_decompress_ptr cinfo, + JSAMPIMAGE input_buf, JDIMENSION input_row, + JSAMPARRAY output_buf, int num_rows) +{ + switch (cinfo->out_color_space) { + case JCS_EXT_RGB: + ycc_extrgb_convert_internal(cinfo, input_buf, input_row, output_buf, + num_rows); + break; + case JCS_EXT_RGBX: + case JCS_EXT_RGBA: + ycc_extrgbx_convert_internal(cinfo, input_buf, input_row, output_buf, + num_rows); + break; + case JCS_EXT_BGR: + ycc_extbgr_convert_internal(cinfo, input_buf, input_row, output_buf, + num_rows); + break; + case JCS_EXT_BGRX: + case JCS_EXT_BGRA: + ycc_extbgrx_convert_internal(cinfo, input_buf, input_row, output_buf, + num_rows); + break; + case JCS_EXT_XBGR: + case JCS_EXT_ABGR: + ycc_extxbgr_convert_internal(cinfo, input_buf, input_row, output_buf, + num_rows); + break; + case JCS_EXT_XRGB: + case JCS_EXT_ARGB: + ycc_extxrgb_convert_internal(cinfo, input_buf, input_row, output_buf, + num_rows); + break; + default: + ycc_rgb_convert_internal(cinfo, input_buf, input_row, output_buf, + num_rows); + break; + } +} + + +/**************** Cases other than YCbCr -> RGB **************/ + + +/* + * Initialize for RGB->grayscale colorspace conversion. + */ + +LOCAL(void) +build_rgb_y_table (j_decompress_ptr cinfo) +{ + my_cconvert_ptr cconvert = (my_cconvert_ptr) cinfo->cconvert; + JLONG *rgb_y_tab; + JLONG i; + + /* Allocate and fill in the conversion tables. */ + cconvert->rgb_y_tab = rgb_y_tab = (JLONG *) + (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE, + (TABLE_SIZE * sizeof(JLONG))); + + for (i = 0; i <= MAXJSAMPLE; i++) { + rgb_y_tab[i+R_Y_OFF] = FIX(0.29900) * i; + rgb_y_tab[i+G_Y_OFF] = FIX(0.58700) * i; + rgb_y_tab[i+B_Y_OFF] = FIX(0.11400) * i + ONE_HALF; + } +} + + +/* + * Convert RGB to grayscale. + */ + +METHODDEF(void) +rgb_gray_convert (j_decompress_ptr cinfo, + JSAMPIMAGE input_buf, JDIMENSION input_row, + JSAMPARRAY output_buf, int num_rows) +{ + my_cconvert_ptr cconvert = (my_cconvert_ptr) cinfo->cconvert; + register int r, g, b; + register JLONG *ctab = cconvert->rgb_y_tab; + register JSAMPROW outptr; + register JSAMPROW inptr0, inptr1, inptr2; + register JDIMENSION col; + JDIMENSION num_cols = cinfo->output_width; + + while (--num_rows >= 0) { + inptr0 = input_buf[0][input_row]; + inptr1 = input_buf[1][input_row]; + inptr2 = input_buf[2][input_row]; + input_row++; + outptr = *output_buf++; + for (col = 0; col < num_cols; col++) { + r = GETJSAMPLE(inptr0[col]); + g = GETJSAMPLE(inptr1[col]); + b = GETJSAMPLE(inptr2[col]); + /* Y */ + outptr[col] = (JSAMPLE) + ((ctab[r+R_Y_OFF] + ctab[g+G_Y_OFF] + ctab[b+B_Y_OFF]) + >> SCALEBITS); + } + } +} + + +/* + * Color conversion for no colorspace change: just copy the data, + * converting from separate-planes to interleaved representation. + */ + +METHODDEF(void) +null_convert (j_decompress_ptr cinfo, + JSAMPIMAGE input_buf, JDIMENSION input_row, + JSAMPARRAY output_buf, int num_rows) +{ + register JSAMPROW inptr, inptr0, inptr1, inptr2, inptr3, outptr; + register JDIMENSION col; + register int num_components = cinfo->num_components; + JDIMENSION num_cols = cinfo->output_width; + int ci; + + if (num_components == 3) { + while (--num_rows >= 0) { + inptr0 = input_buf[0][input_row]; + inptr1 = input_buf[1][input_row]; + inptr2 = input_buf[2][input_row]; + input_row++; + outptr = *output_buf++; + for (col = 0; col < num_cols; col++) { + *outptr++ = inptr0[col]; + *outptr++ = inptr1[col]; + *outptr++ = inptr2[col]; + } + } + } else if (num_components == 4) { + while (--num_rows >= 0) { + inptr0 = input_buf[0][input_row]; + inptr1 = input_buf[1][input_row]; + inptr2 = input_buf[2][input_row]; + inptr3 = input_buf[3][input_row]; + input_row++; + outptr = *output_buf++; + for (col = 0; col < num_cols; col++) { + *outptr++ = inptr0[col]; + *outptr++ = inptr1[col]; + *outptr++ = inptr2[col]; + *outptr++ = inptr3[col]; + } + } + } else { + while (--num_rows >= 0) { + for (ci = 0; ci < num_components; ci++) { + inptr = input_buf[ci][input_row]; + outptr = *output_buf; + for (col = 0; col < num_cols; col++) { + outptr[ci] = inptr[col]; + outptr += num_components; + } + } + output_buf++; + input_row++; + } + } +} + + +/* + * Color conversion for grayscale: just copy the data. + * This also works for YCbCr -> grayscale conversion, in which + * we just copy the Y (luminance) component and ignore chrominance. + */ + +METHODDEF(void) +grayscale_convert (j_decompress_ptr cinfo, + JSAMPIMAGE input_buf, JDIMENSION input_row, + JSAMPARRAY output_buf, int num_rows) +{ + jcopy_sample_rows(input_buf[0], (int) input_row, output_buf, 0, + num_rows, cinfo->output_width); +} + + +/* + * Convert grayscale to RGB + */ + +METHODDEF(void) +gray_rgb_convert (j_decompress_ptr cinfo, + JSAMPIMAGE input_buf, JDIMENSION input_row, + JSAMPARRAY output_buf, int num_rows) +{ + switch (cinfo->out_color_space) { + case JCS_EXT_RGB: + gray_extrgb_convert_internal(cinfo, input_buf, input_row, output_buf, + num_rows); + break; + case JCS_EXT_RGBX: + case JCS_EXT_RGBA: + gray_extrgbx_convert_internal(cinfo, input_buf, input_row, output_buf, + num_rows); + break; + case JCS_EXT_BGR: + gray_extbgr_convert_internal(cinfo, input_buf, input_row, output_buf, + num_rows); + break; + case JCS_EXT_BGRX: + case JCS_EXT_BGRA: + gray_extbgrx_convert_internal(cinfo, input_buf, input_row, output_buf, + num_rows); + break; + case JCS_EXT_XBGR: + case JCS_EXT_ABGR: + gray_extxbgr_convert_internal(cinfo, input_buf, input_row, output_buf, + num_rows); + break; + case JCS_EXT_XRGB: + case JCS_EXT_ARGB: + gray_extxrgb_convert_internal(cinfo, input_buf, input_row, output_buf, + num_rows); + break; + default: + gray_rgb_convert_internal(cinfo, input_buf, input_row, output_buf, + num_rows); + break; + } +} + + +/* + * Convert plain RGB to extended RGB + */ + +METHODDEF(void) +rgb_rgb_convert (j_decompress_ptr cinfo, + JSAMPIMAGE input_buf, JDIMENSION input_row, + JSAMPARRAY output_buf, int num_rows) +{ + switch (cinfo->out_color_space) { + case JCS_EXT_RGB: + rgb_extrgb_convert_internal(cinfo, input_buf, input_row, output_buf, + num_rows); + break; + case JCS_EXT_RGBX: + case JCS_EXT_RGBA: + rgb_extrgbx_convert_internal(cinfo, input_buf, input_row, output_buf, + num_rows); + break; + case JCS_EXT_BGR: + rgb_extbgr_convert_internal(cinfo, input_buf, input_row, output_buf, + num_rows); + break; + case JCS_EXT_BGRX: + case JCS_EXT_BGRA: + rgb_extbgrx_convert_internal(cinfo, input_buf, input_row, output_buf, + num_rows); + break; + case JCS_EXT_XBGR: + case JCS_EXT_ABGR: + rgb_extxbgr_convert_internal(cinfo, input_buf, input_row, output_buf, + num_rows); + break; + case JCS_EXT_XRGB: + case JCS_EXT_ARGB: + rgb_extxrgb_convert_internal(cinfo, input_buf, input_row, output_buf, + num_rows); + break; + default: + rgb_rgb_convert_internal(cinfo, input_buf, input_row, output_buf, + num_rows); + break; + } +} + + +/* + * Adobe-style YCCK->CMYK conversion. + * We convert YCbCr to R=1-C, G=1-M, and B=1-Y using the same + * conversion as above, while passing K (black) unchanged. + * We assume build_ycc_rgb_table has been called. + */ + +METHODDEF(void) +ycck_cmyk_convert (j_decompress_ptr cinfo, + JSAMPIMAGE input_buf, JDIMENSION input_row, + JSAMPARRAY output_buf, int num_rows) +{ + my_cconvert_ptr cconvert = (my_cconvert_ptr) cinfo->cconvert; + register int y, cb, cr; + register JSAMPROW outptr; + register JSAMPROW inptr0, inptr1, inptr2, inptr3; + register JDIMENSION col; + JDIMENSION num_cols = cinfo->output_width; + /* copy these pointers into registers if possible */ + register JSAMPLE *range_limit = cinfo->sample_range_limit; + register int *Crrtab = cconvert->Cr_r_tab; + register int *Cbbtab = cconvert->Cb_b_tab; + register JLONG *Crgtab = cconvert->Cr_g_tab; + register JLONG *Cbgtab = cconvert->Cb_g_tab; + SHIFT_TEMPS + + while (--num_rows >= 0) { + inptr0 = input_buf[0][input_row]; + inptr1 = input_buf[1][input_row]; + inptr2 = input_buf[2][input_row]; + inptr3 = input_buf[3][input_row]; + input_row++; + outptr = *output_buf++; + for (col = 0; col < num_cols; col++) { + y = GETJSAMPLE(inptr0[col]); + cb = GETJSAMPLE(inptr1[col]); + cr = GETJSAMPLE(inptr2[col]); + /* Range-limiting is essential due to noise introduced by DCT losses. */ + outptr[0] = range_limit[MAXJSAMPLE - (y + Crrtab[cr])]; /* red */ + outptr[1] = range_limit[MAXJSAMPLE - (y + /* green */ + ((int) RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr], + SCALEBITS)))]; + outptr[2] = range_limit[MAXJSAMPLE - (y + Cbbtab[cb])]; /* blue */ + /* K passes through unchanged */ + outptr[3] = inptr3[col]; /* don't need GETJSAMPLE here */ + outptr += 4; + } + } +} + + +/* + * RGB565 conversion + */ + +#define PACK_SHORT_565_LE(r, g, b) ((((r) << 8) & 0xF800) | \ + (((g) << 3) & 0x7E0) | ((b) >> 3)) +#define PACK_SHORT_565_BE(r, g, b) (((r) & 0xF8) | ((g) >> 5) | \ + (((g) << 11) & 0xE000) | \ + (((b) << 5) & 0x1F00)) + +#define PACK_TWO_PIXELS_LE(l, r) ((r << 16) | l) +#define PACK_TWO_PIXELS_BE(l, r) ((l << 16) | r) + +#define PACK_NEED_ALIGNMENT(ptr) (((size_t)(ptr)) & 3) + +#define WRITE_TWO_ALIGNED_PIXELS(addr, pixels) ((*(int *)(addr)) = pixels) + +#define DITHER_565_R(r, dither) ((r) + ((dither) & 0xFF)) +#define DITHER_565_G(g, dither) ((g) + (((dither) & 0xFF) >> 1)) +#define DITHER_565_B(b, dither) ((b) + ((dither) & 0xFF)) + + +/* Declarations for ordered dithering + * + * We use a 4x4 ordered dither array packed into 32 bits. This array is + * sufficent for dithering RGB888 to RGB565. + */ + +#define DITHER_MASK 0x3 +#define DITHER_ROTATE(x) ((((x) & 0xFF) << 24) | (((x) >> 8) & 0x00FFFFFF)) +static const JLONG dither_matrix[4] = { + 0x0008020A, + 0x0C040E06, + 0x030B0109, + 0x0F070D05 +}; + + +static INLINE boolean is_big_endian(void) +{ + int test_value = 1; + if(*(char *)&test_value != 1) + return TRUE; + return FALSE; +} + + +/* Include inline routines for RGB565 conversion */ + +#define PACK_SHORT_565 PACK_SHORT_565_LE +#define PACK_TWO_PIXELS PACK_TWO_PIXELS_LE +#define ycc_rgb565_convert_internal ycc_rgb565_convert_le +#define ycc_rgb565D_convert_internal ycc_rgb565D_convert_le +#define rgb_rgb565_convert_internal rgb_rgb565_convert_le +#define rgb_rgb565D_convert_internal rgb_rgb565D_convert_le +#define gray_rgb565_convert_internal gray_rgb565_convert_le +#define gray_rgb565D_convert_internal gray_rgb565D_convert_le +#include "jdcol565.c" +#undef PACK_SHORT_565 +#undef PACK_TWO_PIXELS +#undef ycc_rgb565_convert_internal +#undef ycc_rgb565D_convert_internal +#undef rgb_rgb565_convert_internal +#undef rgb_rgb565D_convert_internal +#undef gray_rgb565_convert_internal +#undef gray_rgb565D_convert_internal + +#define PACK_SHORT_565 PACK_SHORT_565_BE +#define PACK_TWO_PIXELS PACK_TWO_PIXELS_BE +#define ycc_rgb565_convert_internal ycc_rgb565_convert_be +#define ycc_rgb565D_convert_internal ycc_rgb565D_convert_be +#define rgb_rgb565_convert_internal rgb_rgb565_convert_be +#define rgb_rgb565D_convert_internal rgb_rgb565D_convert_be +#define gray_rgb565_convert_internal gray_rgb565_convert_be +#define gray_rgb565D_convert_internal gray_rgb565D_convert_be +#include "jdcol565.c" +#undef PACK_SHORT_565 +#undef PACK_TWO_PIXELS +#undef ycc_rgb565_convert_internal +#undef ycc_rgb565D_convert_internal +#undef rgb_rgb565_convert_internal +#undef rgb_rgb565D_convert_internal +#undef gray_rgb565_convert_internal +#undef gray_rgb565D_convert_internal + + +METHODDEF(void) +ycc_rgb565_convert (j_decompress_ptr cinfo, + JSAMPIMAGE input_buf, JDIMENSION input_row, + JSAMPARRAY output_buf, int num_rows) +{ + if (is_big_endian()) + ycc_rgb565_convert_be(cinfo, input_buf, input_row, output_buf, num_rows); + else + ycc_rgb565_convert_le(cinfo, input_buf, input_row, output_buf, num_rows); +} + + +METHODDEF(void) +ycc_rgb565D_convert (j_decompress_ptr cinfo, + JSAMPIMAGE input_buf, JDIMENSION input_row, + JSAMPARRAY output_buf, int num_rows) +{ + if (is_big_endian()) + ycc_rgb565D_convert_be(cinfo, input_buf, input_row, output_buf, num_rows); + else + ycc_rgb565D_convert_le(cinfo, input_buf, input_row, output_buf, num_rows); +} + + +METHODDEF(void) +rgb_rgb565_convert (j_decompress_ptr cinfo, + JSAMPIMAGE input_buf, JDIMENSION input_row, + JSAMPARRAY output_buf, int num_rows) +{ + if (is_big_endian()) + rgb_rgb565_convert_be(cinfo, input_buf, input_row, output_buf, num_rows); + else + rgb_rgb565_convert_le(cinfo, input_buf, input_row, output_buf, num_rows); +} + + +METHODDEF(void) +rgb_rgb565D_convert (j_decompress_ptr cinfo, + JSAMPIMAGE input_buf, JDIMENSION input_row, + JSAMPARRAY output_buf, int num_rows) +{ + if (is_big_endian()) + rgb_rgb565D_convert_be(cinfo, input_buf, input_row, output_buf, num_rows); + else + rgb_rgb565D_convert_le(cinfo, input_buf, input_row, output_buf, num_rows); +} + + +METHODDEF(void) +gray_rgb565_convert (j_decompress_ptr cinfo, + JSAMPIMAGE input_buf, JDIMENSION input_row, + JSAMPARRAY output_buf, int num_rows) +{ + if (is_big_endian()) + gray_rgb565_convert_be(cinfo, input_buf, input_row, output_buf, num_rows); + else + gray_rgb565_convert_le(cinfo, input_buf, input_row, output_buf, num_rows); +} + + +METHODDEF(void) +gray_rgb565D_convert (j_decompress_ptr cinfo, + JSAMPIMAGE input_buf, JDIMENSION input_row, + JSAMPARRAY output_buf, int num_rows) +{ + if (is_big_endian()) + gray_rgb565D_convert_be(cinfo, input_buf, input_row, output_buf, num_rows); + else + gray_rgb565D_convert_le(cinfo, input_buf, input_row, output_buf, num_rows); +} + + +/* + * Empty method for start_pass. + */ + +METHODDEF(void) +start_pass_dcolor (j_decompress_ptr cinfo) +{ + /* no work needed */ +} + + +/* + * Module initialization routine for output colorspace conversion. + */ + +GLOBAL(void) +jinit_color_deconverter (j_decompress_ptr cinfo) +{ + my_cconvert_ptr cconvert; + int ci; + + cconvert = (my_cconvert_ptr) + (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE, + sizeof(my_color_deconverter)); + cinfo->cconvert = (struct jpeg_color_deconverter *) cconvert; + cconvert->pub.start_pass = start_pass_dcolor; + + /* Make sure num_components agrees with jpeg_color_space */ + switch (cinfo->jpeg_color_space) { + case JCS_GRAYSCALE: + if (cinfo->num_components != 1) + ERREXIT(cinfo, JERR_BAD_J_COLORSPACE); + break; + + case JCS_RGB: + case JCS_YCbCr: + if (cinfo->num_components != 3) + ERREXIT(cinfo, JERR_BAD_J_COLORSPACE); + break; + + case JCS_CMYK: + case JCS_YCCK: + if (cinfo->num_components != 4) + ERREXIT(cinfo, JERR_BAD_J_COLORSPACE); + break; + + default: /* JCS_UNKNOWN can be anything */ + if (cinfo->num_components < 1) + ERREXIT(cinfo, JERR_BAD_J_COLORSPACE); + break; + } + + /* Set out_color_components and conversion method based on requested space. + * Also clear the component_needed flags for any unused components, + * so that earlier pipeline stages can avoid useless computation. + */ + + switch (cinfo->out_color_space) { + case JCS_GRAYSCALE: + cinfo->out_color_components = 1; + if (cinfo->jpeg_color_space == JCS_GRAYSCALE || + cinfo->jpeg_color_space == JCS_YCbCr) { + cconvert->pub.color_convert = grayscale_convert; + /* For color->grayscale conversion, only the Y (0) component is needed */ + for (ci = 1; ci < cinfo->num_components; ci++) + cinfo->comp_info[ci].component_needed = FALSE; + } else if (cinfo->jpeg_color_space == JCS_RGB) { + cconvert->pub.color_convert = rgb_gray_convert; + build_rgb_y_table(cinfo); + } else + ERREXIT(cinfo, JERR_CONVERSION_NOTIMPL); + break; + + case JCS_RGB: + case JCS_EXT_RGB: + case JCS_EXT_RGBX: + case JCS_EXT_BGR: + case JCS_EXT_BGRX: + case JCS_EXT_XBGR: + case JCS_EXT_XRGB: + case JCS_EXT_RGBA: + case JCS_EXT_BGRA: + case JCS_EXT_ABGR: + case JCS_EXT_ARGB: + cinfo->out_color_components = rgb_pixelsize[cinfo->out_color_space]; + if (cinfo->jpeg_color_space == JCS_YCbCr) { + if (jsimd_can_ycc_rgb()) + cconvert->pub.color_convert = jsimd_ycc_rgb_convert; + else { + cconvert->pub.color_convert = ycc_rgb_convert; + build_ycc_rgb_table(cinfo); + } + } else if (cinfo->jpeg_color_space == JCS_GRAYSCALE) { + cconvert->pub.color_convert = gray_rgb_convert; + } else if (cinfo->jpeg_color_space == JCS_RGB) { + if (rgb_red[cinfo->out_color_space] == 0 && + rgb_green[cinfo->out_color_space] == 1 && + rgb_blue[cinfo->out_color_space] == 2 && + rgb_pixelsize[cinfo->out_color_space] == 3) + cconvert->pub.color_convert = null_convert; + else + cconvert->pub.color_convert = rgb_rgb_convert; + } else + ERREXIT(cinfo, JERR_CONVERSION_NOTIMPL); + break; + + case JCS_RGB565: + cinfo->out_color_components = 3; + if (cinfo->dither_mode == JDITHER_NONE) { + if (cinfo->jpeg_color_space == JCS_YCbCr) { + if (jsimd_can_ycc_rgb565()) + cconvert->pub.color_convert = jsimd_ycc_rgb565_convert; + else { + cconvert->pub.color_convert = ycc_rgb565_convert; + build_ycc_rgb_table(cinfo); + } + } else if (cinfo->jpeg_color_space == JCS_GRAYSCALE) { + cconvert->pub.color_convert = gray_rgb565_convert; + } else if (cinfo->jpeg_color_space == JCS_RGB) { + cconvert->pub.color_convert = rgb_rgb565_convert; + } else + ERREXIT(cinfo, JERR_CONVERSION_NOTIMPL); + } else { + /* only ordered dithering is supported */ + if (cinfo->jpeg_color_space == JCS_YCbCr) { + cconvert->pub.color_convert = ycc_rgb565D_convert; + build_ycc_rgb_table(cinfo); + } else if (cinfo->jpeg_color_space == JCS_GRAYSCALE) { + cconvert->pub.color_convert = gray_rgb565D_convert; + } else if (cinfo->jpeg_color_space == JCS_RGB) { + cconvert->pub.color_convert = rgb_rgb565D_convert; + } else + ERREXIT(cinfo, JERR_CONVERSION_NOTIMPL); + } + break; + + case JCS_CMYK: + cinfo->out_color_components = 4; + if (cinfo->jpeg_color_space == JCS_YCCK) { + cconvert->pub.color_convert = ycck_cmyk_convert; + build_ycc_rgb_table(cinfo); + } else if (cinfo->jpeg_color_space == JCS_CMYK) { + cconvert->pub.color_convert = null_convert; + } else + ERREXIT(cinfo, JERR_CONVERSION_NOTIMPL); + break; + + default: + /* Permit null conversion to same output space */ + if (cinfo->out_color_space == cinfo->jpeg_color_space) { + cinfo->out_color_components = cinfo->num_components; + cconvert->pub.color_convert = null_convert; + } else /* unsupported non-null conversion */ + ERREXIT(cinfo, JERR_CONVERSION_NOTIMPL); + break; + } + + if (cinfo->quantize_colors) + cinfo->output_components = 1; /* single colormapped output component */ + else + cinfo->output_components = cinfo->out_color_components; +} diff --git a/media/libjpeg/jdct.h b/media/libjpeg/jdct.h new file mode 100644 index 000000000..faf8e1cf0 --- /dev/null +++ b/media/libjpeg/jdct.h @@ -0,0 +1,208 @@ +/* + * jdct.h + * + * This file was part of the Independent JPEG Group's software: + * Copyright (C) 1994-1996, Thomas G. Lane. + * libjpeg-turbo Modifications: + * Copyright (C) 2015, D. R. Commander. + * For conditions of distribution and use, see the accompanying README.ijg + * file. + * + * This include file contains common declarations for the forward and + * inverse DCT modules. These declarations are private to the DCT managers + * (jcdctmgr.c, jddctmgr.c) and the individual DCT algorithms. + * The individual DCT algorithms are kept in separate files to ease + * machine-dependent tuning (e.g., assembly coding). + */ + + +/* + * A forward DCT routine is given a pointer to a work area of type DCTELEM[]; + * the DCT is to be performed in-place in that buffer. Type DCTELEM is int + * for 8-bit samples, JLONG for 12-bit samples. (NOTE: Floating-point DCT + * implementations use an array of type FAST_FLOAT, instead.) + * The DCT inputs are expected to be signed (range +-CENTERJSAMPLE). + * The DCT outputs are returned scaled up by a factor of 8; they therefore + * have a range of +-8K for 8-bit data, +-128K for 12-bit data. This + * convention improves accuracy in integer implementations and saves some + * work in floating-point ones. + * Quantization of the output coefficients is done by jcdctmgr.c. This + * step requires an unsigned type and also one with twice the bits. + */ + +#if BITS_IN_JSAMPLE == 8 +#ifndef WITH_SIMD +typedef int DCTELEM; /* 16 or 32 bits is fine */ +typedef unsigned int UDCTELEM; +typedef unsigned long long UDCTELEM2; +#else +typedef short DCTELEM; /* prefer 16 bit with SIMD for parellelism */ +typedef unsigned short UDCTELEM; +typedef unsigned int UDCTELEM2; +#endif +#else +typedef JLONG DCTELEM; /* must have 32 bits */ +typedef unsigned long long UDCTELEM2; +#endif + + +/* + * An inverse DCT routine is given a pointer to the input JBLOCK and a pointer + * to an output sample array. The routine must dequantize the input data as + * well as perform the IDCT; for dequantization, it uses the multiplier table + * pointed to by compptr->dct_table. The output data is to be placed into the + * sample array starting at a specified column. (Any row offset needed will + * be applied to the array pointer before it is passed to the IDCT code.) + * Note that the number of samples emitted by the IDCT routine is + * DCT_scaled_size * DCT_scaled_size. + */ + +/* typedef inverse_DCT_method_ptr is declared in jpegint.h */ + +/* + * Each IDCT routine has its own ideas about the best dct_table element type. + */ + +typedef MULTIPLIER ISLOW_MULT_TYPE; /* short or int, whichever is faster */ +#if BITS_IN_JSAMPLE == 8 +typedef MULTIPLIER IFAST_MULT_TYPE; /* 16 bits is OK, use short if faster */ +#define IFAST_SCALE_BITS 2 /* fractional bits in scale factors */ +#else +typedef JLONG IFAST_MULT_TYPE; /* need 32 bits for scaled quantizers */ +#define IFAST_SCALE_BITS 13 /* fractional bits in scale factors */ +#endif +typedef FAST_FLOAT FLOAT_MULT_TYPE; /* preferred floating type */ + + +/* + * Each IDCT routine is responsible for range-limiting its results and + * converting them to unsigned form (0..MAXJSAMPLE). The raw outputs could + * be quite far out of range if the input data is corrupt, so a bulletproof + * range-limiting step is required. We use a mask-and-table-lookup method + * to do the combined operations quickly. See the comments with + * prepare_range_limit_table (in jdmaster.c) for more info. + */ + +#define IDCT_range_limit(cinfo) ((cinfo)->sample_range_limit + CENTERJSAMPLE) + +#define RANGE_MASK (MAXJSAMPLE * 4 + 3) /* 2 bits wider than legal samples */ + + +/* Extern declarations for the forward and inverse DCT routines. */ + +EXTERN(void) jpeg_fdct_islow (DCTELEM *data); +EXTERN(void) jpeg_fdct_ifast (DCTELEM *data); +EXTERN(void) jpeg_fdct_float (FAST_FLOAT *data); + +EXTERN(void) jpeg_idct_islow + (j_decompress_ptr cinfo, jpeg_component_info *compptr, + JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col); +EXTERN(void) jpeg_idct_ifast + (j_decompress_ptr cinfo, jpeg_component_info *compptr, + JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col); +EXTERN(void) jpeg_idct_float + (j_decompress_ptr cinfo, jpeg_component_info *compptr, + JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col); +EXTERN(void) jpeg_idct_7x7 + (j_decompress_ptr cinfo, jpeg_component_info *compptr, + JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col); +EXTERN(void) jpeg_idct_6x6 + (j_decompress_ptr cinfo, jpeg_component_info *compptr, + JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col); +EXTERN(void) jpeg_idct_5x5 + (j_decompress_ptr cinfo, jpeg_component_info *compptr, + JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col); +EXTERN(void) jpeg_idct_4x4 + (j_decompress_ptr cinfo, jpeg_component_info *compptr, + JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col); +EXTERN(void) jpeg_idct_3x3 + (j_decompress_ptr cinfo, jpeg_component_info *compptr, + JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col); +EXTERN(void) jpeg_idct_2x2 + (j_decompress_ptr cinfo, jpeg_component_info *compptr, + JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col); +EXTERN(void) jpeg_idct_1x1 + (j_decompress_ptr cinfo, jpeg_component_info *compptr, + JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col); +EXTERN(void) jpeg_idct_9x9 + (j_decompress_ptr cinfo, jpeg_component_info *compptr, + JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col); +EXTERN(void) jpeg_idct_10x10 + (j_decompress_ptr cinfo, jpeg_component_info *compptr, + JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col); +EXTERN(void) jpeg_idct_11x11 + (j_decompress_ptr cinfo, jpeg_component_info *compptr, + JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col); +EXTERN(void) jpeg_idct_12x12 + (j_decompress_ptr cinfo, jpeg_component_info *compptr, + JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col); +EXTERN(void) jpeg_idct_13x13 + (j_decompress_ptr cinfo, jpeg_component_info *compptr, + JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col); +EXTERN(void) jpeg_idct_14x14 + (j_decompress_ptr cinfo, jpeg_component_info *compptr, + JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col); +EXTERN(void) jpeg_idct_15x15 + (j_decompress_ptr cinfo, jpeg_component_info *compptr, + JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col); +EXTERN(void) jpeg_idct_16x16 + (j_decompress_ptr cinfo, jpeg_component_info *compptr, + JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col); + + +/* + * Macros for handling fixed-point arithmetic; these are used by many + * but not all of the DCT/IDCT modules. + * + * All values are expected to be of type JLONG. + * Fractional constants are scaled left by CONST_BITS bits. + * CONST_BITS is defined within each module using these macros, + * and may differ from one module to the next. + */ + +#define ONE ((JLONG) 1) +#define CONST_SCALE (ONE << CONST_BITS) + +/* Convert a positive real constant to an integer scaled by CONST_SCALE. + * Caution: some C compilers fail to reduce "FIX(constant)" at compile time, + * thus causing a lot of useless floating-point operations at run time. + */ + +#define FIX(x) ((JLONG) ((x) * CONST_SCALE + 0.5)) + +/* Descale and correctly round a JLONG value that's scaled by N bits. + * We assume RIGHT_SHIFT rounds towards minus infinity, so adding + * the fudge factor is correct for either sign of X. + */ + +#define DESCALE(x,n) RIGHT_SHIFT((x) + (ONE << ((n)-1)), n) + +/* Multiply a JLONG variable by a JLONG constant to yield a JLONG result. + * This macro is used only when the two inputs will actually be no more than + * 16 bits wide, so that a 16x16->32 bit multiply can be used instead of a + * full 32x32 multiply. This provides a useful speedup on many machines. + * Unfortunately there is no way to specify a 16x16->32 multiply portably + * in C, but some C compilers will do the right thing if you provide the + * correct combination of casts. + */ + +#ifdef SHORTxSHORT_32 /* may work if 'int' is 32 bits */ +#define MULTIPLY16C16(var,const) (((INT16) (var)) * ((INT16) (const))) +#endif +#ifdef SHORTxLCONST_32 /* known to work with Microsoft C 6.0 */ +#define MULTIPLY16C16(var,const) (((INT16) (var)) * ((JLONG) (const))) +#endif + +#ifndef MULTIPLY16C16 /* default definition */ +#define MULTIPLY16C16(var,const) ((var) * (const)) +#endif + +/* Same except both inputs are variables. */ + +#ifdef SHORTxSHORT_32 /* may work if 'int' is 32 bits */ +#define MULTIPLY16V16(var1,var2) (((INT16) (var1)) * ((INT16) (var2))) +#endif + +#ifndef MULTIPLY16V16 /* default definition */ +#define MULTIPLY16V16(var1,var2) ((var1) * (var2)) +#endif diff --git a/media/libjpeg/jddctmgr.c b/media/libjpeg/jddctmgr.c new file mode 100644 index 000000000..3a5ba7e89 --- /dev/null +++ b/media/libjpeg/jddctmgr.c @@ -0,0 +1,352 @@ +/* + * jddctmgr.c + * + * This file was part of the Independent JPEG Group's software: + * Copyright (C) 1994-1996, Thomas G. Lane. + * Modified 2002-2010 by Guido Vollbeding. + * libjpeg-turbo Modifications: + * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB + * Copyright (C) 2010, 2015, D. R. Commander. + * Copyright (C) 2013, MIPS Technologies, Inc., California. + * For conditions of distribution and use, see the accompanying README.ijg + * file. + * + * This file contains the inverse-DCT management logic. + * This code selects a particular IDCT implementation to be used, + * and it performs related housekeeping chores. No code in this file + * is executed per IDCT step, only during output pass setup. + * + * Note that the IDCT routines are responsible for performing coefficient + * dequantization as well as the IDCT proper. This module sets up the + * dequantization multiplier table needed by the IDCT routine. + */ + +#define JPEG_INTERNALS +#include "jinclude.h" +#include "jpeglib.h" +#include "jdct.h" /* Private declarations for DCT subsystem */ +#include "jsimddct.h" +#include "jpegcomp.h" + + +/* + * The decompressor input side (jdinput.c) saves away the appropriate + * quantization table for each component at the start of the first scan + * involving that component. (This is necessary in order to correctly + * decode files that reuse Q-table slots.) + * When we are ready to make an output pass, the saved Q-table is converted + * to a multiplier table that will actually be used by the IDCT routine. + * The multiplier table contents are IDCT-method-dependent. To support + * application changes in IDCT method between scans, we can remake the + * multiplier tables if necessary. + * In buffered-image mode, the first output pass may occur before any data + * has been seen for some components, and thus before their Q-tables have + * been saved away. To handle this case, multiplier tables are preset + * to zeroes; the result of the IDCT will be a neutral gray level. + */ + + +/* Private subobject for this module */ + +typedef struct { + struct jpeg_inverse_dct pub; /* public fields */ + + /* This array contains the IDCT method code that each multiplier table + * is currently set up for, or -1 if it's not yet set up. + * The actual multiplier tables are pointed to by dct_table in the + * per-component comp_info structures. + */ + int cur_method[MAX_COMPONENTS]; +} my_idct_controller; + +typedef my_idct_controller *my_idct_ptr; + + +/* Allocated multiplier tables: big enough for any supported variant */ + +typedef union { + ISLOW_MULT_TYPE islow_array[DCTSIZE2]; +#ifdef DCT_IFAST_SUPPORTED + IFAST_MULT_TYPE ifast_array[DCTSIZE2]; +#endif +#ifdef DCT_FLOAT_SUPPORTED + FLOAT_MULT_TYPE float_array[DCTSIZE2]; +#endif +} multiplier_table; + + +/* The current scaled-IDCT routines require ISLOW-style multiplier tables, + * so be sure to compile that code if either ISLOW or SCALING is requested. + */ +#ifdef DCT_ISLOW_SUPPORTED +#define PROVIDE_ISLOW_TABLES +#else +#ifdef IDCT_SCALING_SUPPORTED +#define PROVIDE_ISLOW_TABLES +#endif +#endif + + +/* + * Prepare for an output pass. + * Here we select the proper IDCT routine for each component and build + * a matching multiplier table. + */ + +METHODDEF(void) +start_pass (j_decompress_ptr cinfo) +{ + my_idct_ptr idct = (my_idct_ptr) cinfo->idct; + int ci, i; + jpeg_component_info *compptr; + int method = 0; + inverse_DCT_method_ptr method_ptr = NULL; + JQUANT_TBL *qtbl; + + for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components; + ci++, compptr++) { + /* Select the proper IDCT routine for this component's scaling */ + switch (compptr->_DCT_scaled_size) { +#ifdef IDCT_SCALING_SUPPORTED + case 1: + method_ptr = jpeg_idct_1x1; + method = JDCT_ISLOW; /* jidctred uses islow-style table */ + break; + case 2: + if (jsimd_can_idct_2x2()) + method_ptr = jsimd_idct_2x2; + else + method_ptr = jpeg_idct_2x2; + method = JDCT_ISLOW; /* jidctred uses islow-style table */ + break; + case 3: + method_ptr = jpeg_idct_3x3; + method = JDCT_ISLOW; /* jidctint uses islow-style table */ + break; + case 4: + if (jsimd_can_idct_4x4()) + method_ptr = jsimd_idct_4x4; + else + method_ptr = jpeg_idct_4x4; + method = JDCT_ISLOW; /* jidctred uses islow-style table */ + break; + case 5: + method_ptr = jpeg_idct_5x5; + method = JDCT_ISLOW; /* jidctint uses islow-style table */ + break; + case 6: +#if defined(__mips__) + if (jsimd_can_idct_6x6()) + method_ptr = jsimd_idct_6x6; + else +#endif + method_ptr = jpeg_idct_6x6; + method = JDCT_ISLOW; /* jidctint uses islow-style table */ + break; + case 7: + method_ptr = jpeg_idct_7x7; + method = JDCT_ISLOW; /* jidctint uses islow-style table */ + break; +#endif + case DCTSIZE: + switch (cinfo->dct_method) { +#ifdef DCT_ISLOW_SUPPORTED + case JDCT_ISLOW: + if (jsimd_can_idct_islow()) + method_ptr = jsimd_idct_islow; + else + method_ptr = jpeg_idct_islow; + method = JDCT_ISLOW; + break; +#endif +#ifdef DCT_IFAST_SUPPORTED + case JDCT_IFAST: + if (jsimd_can_idct_ifast()) + method_ptr = jsimd_idct_ifast; + else + method_ptr = jpeg_idct_ifast; + method = JDCT_IFAST; + break; +#endif +#ifdef DCT_FLOAT_SUPPORTED + case JDCT_FLOAT: + if (jsimd_can_idct_float()) + method_ptr = jsimd_idct_float; + else + method_ptr = jpeg_idct_float; + method = JDCT_FLOAT; + break; +#endif + default: + ERREXIT(cinfo, JERR_NOT_COMPILED); + break; + } + break; +#ifdef IDCT_SCALING_SUPPORTED + case 9: + method_ptr = jpeg_idct_9x9; + method = JDCT_ISLOW; /* jidctint uses islow-style table */ + break; + case 10: + method_ptr = jpeg_idct_10x10; + method = JDCT_ISLOW; /* jidctint uses islow-style table */ + break; + case 11: + method_ptr = jpeg_idct_11x11; + method = JDCT_ISLOW; /* jidctint uses islow-style table */ + break; + case 12: +#if defined(__mips__) + if (jsimd_can_idct_12x12()) + method_ptr = jsimd_idct_12x12; + else +#endif + method_ptr = jpeg_idct_12x12; + method = JDCT_ISLOW; /* jidctint uses islow-style table */ + break; + case 13: + method_ptr = jpeg_idct_13x13; + method = JDCT_ISLOW; /* jidctint uses islow-style table */ + break; + case 14: + method_ptr = jpeg_idct_14x14; + method = JDCT_ISLOW; /* jidctint uses islow-style table */ + break; + case 15: + method_ptr = jpeg_idct_15x15; + method = JDCT_ISLOW; /* jidctint uses islow-style table */ + break; + case 16: + method_ptr = jpeg_idct_16x16; + method = JDCT_ISLOW; /* jidctint uses islow-style table */ + break; +#endif + default: + ERREXIT1(cinfo, JERR_BAD_DCTSIZE, compptr->_DCT_scaled_size); + break; + } + idct->pub.inverse_DCT[ci] = method_ptr; + /* Create multiplier table from quant table. + * However, we can skip this if the component is uninteresting + * or if we already built the table. Also, if no quant table + * has yet been saved for the component, we leave the + * multiplier table all-zero; we'll be reading zeroes from the + * coefficient controller's buffer anyway. + */ + if (! compptr->component_needed || idct->cur_method[ci] == method) + continue; + qtbl = compptr->quant_table; + if (qtbl == NULL) /* happens if no data yet for component */ + continue; + idct->cur_method[ci] = method; + switch (method) { +#ifdef PROVIDE_ISLOW_TABLES + case JDCT_ISLOW: + { + /* For LL&M IDCT method, multipliers are equal to raw quantization + * coefficients, but are stored as ints to ensure access efficiency. + */ + ISLOW_MULT_TYPE *ismtbl = (ISLOW_MULT_TYPE *) compptr->dct_table; + for (i = 0; i < DCTSIZE2; i++) { + ismtbl[i] = (ISLOW_MULT_TYPE) qtbl->quantval[i]; + } + } + break; +#endif +#ifdef DCT_IFAST_SUPPORTED + case JDCT_IFAST: + { + /* For AA&N IDCT method, multipliers are equal to quantization + * coefficients scaled by scalefactor[row]*scalefactor[col], where + * scalefactor[0] = 1 + * scalefactor[k] = cos(k*PI/16) * sqrt(2) for k=1..7 + * For integer operation, the multiplier table is to be scaled by + * IFAST_SCALE_BITS. + */ + IFAST_MULT_TYPE *ifmtbl = (IFAST_MULT_TYPE *) compptr->dct_table; +#define CONST_BITS 14 + static const INT16 aanscales[DCTSIZE2] = { + /* precomputed values scaled up by 14 bits */ + 16384, 22725, 21407, 19266, 16384, 12873, 8867, 4520, + 22725, 31521, 29692, 26722, 22725, 17855, 12299, 6270, + 21407, 29692, 27969, 25172, 21407, 16819, 11585, 5906, + 19266, 26722, 25172, 22654, 19266, 15137, 10426, 5315, + 16384, 22725, 21407, 19266, 16384, 12873, 8867, 4520, + 12873, 17855, 16819, 15137, 12873, 10114, 6967, 3552, + 8867, 12299, 11585, 10426, 8867, 6967, 4799, 2446, + 4520, 6270, 5906, 5315, 4520, 3552, 2446, 1247 + }; + SHIFT_TEMPS + + for (i = 0; i < DCTSIZE2; i++) { + ifmtbl[i] = (IFAST_MULT_TYPE) + DESCALE(MULTIPLY16V16((JLONG) qtbl->quantval[i], + (JLONG) aanscales[i]), + CONST_BITS-IFAST_SCALE_BITS); + } + } + break; +#endif +#ifdef DCT_FLOAT_SUPPORTED + case JDCT_FLOAT: + { + /* For float AA&N IDCT method, multipliers are equal to quantization + * coefficients scaled by scalefactor[row]*scalefactor[col], where + * scalefactor[0] = 1 + * scalefactor[k] = cos(k*PI/16) * sqrt(2) for k=1..7 + */ + FLOAT_MULT_TYPE *fmtbl = (FLOAT_MULT_TYPE *) compptr->dct_table; + int row, col; + static const double aanscalefactor[DCTSIZE] = { + 1.0, 1.387039845, 1.306562965, 1.175875602, + 1.0, 0.785694958, 0.541196100, 0.275899379 + }; + + i = 0; + for (row = 0; row < DCTSIZE; row++) { + for (col = 0; col < DCTSIZE; col++) { + fmtbl[i] = (FLOAT_MULT_TYPE) + ((double) qtbl->quantval[i] * + aanscalefactor[row] * aanscalefactor[col]); + i++; + } + } + } + break; +#endif + default: + ERREXIT(cinfo, JERR_NOT_COMPILED); + break; + } + } +} + + +/* + * Initialize IDCT manager. + */ + +GLOBAL(void) +jinit_inverse_dct (j_decompress_ptr cinfo) +{ + my_idct_ptr idct; + int ci; + jpeg_component_info *compptr; + + idct = (my_idct_ptr) + (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE, + sizeof(my_idct_controller)); + cinfo->idct = (struct jpeg_inverse_dct *) idct; + idct->pub.start_pass = start_pass; + + for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components; + ci++, compptr++) { + /* Allocate and pre-zero a multiplier table for each component */ + compptr->dct_table = + (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE, + sizeof(multiplier_table)); + MEMZERO(compptr->dct_table, sizeof(multiplier_table)); + /* Mark multiplier table not yet set up for any method */ + idct->cur_method[ci] = -1; + } +} diff --git a/media/libjpeg/jdhuff.c b/media/libjpeg/jdhuff.c new file mode 100644 index 000000000..fa78e2962 --- /dev/null +++ b/media/libjpeg/jdhuff.c @@ -0,0 +1,823 @@ +/* + * jdhuff.c + * + * This file was part of the Independent JPEG Group's software: + * Copyright (C) 1991-1997, Thomas G. Lane. + * libjpeg-turbo Modifications: + * Copyright (C) 2009-2011, 2016, D. R. Commander. + * For conditions of distribution and use, see the accompanying README.ijg + * file. + * + * This file contains Huffman entropy decoding routines. + * + * Much of the complexity here has to do with supporting input suspension. + * If the data source module demands suspension, we want to be able to back + * up to the start of the current MCU. To do this, we copy state variables + * into local working storage, and update them back to the permanent + * storage only upon successful completion of an MCU. + */ + +#define JPEG_INTERNALS +#include "jinclude.h" +#include "jpeglib.h" +#include "jdhuff.h" /* Declarations shared with jdphuff.c */ +#include "jpegcomp.h" +#include "jstdhuff.c" + + +/* + * Expanded entropy decoder object for Huffman decoding. + * + * The savable_state subrecord contains fields that change within an MCU, + * but must not be updated permanently until we complete the MCU. + */ + +typedef struct { + int last_dc_val[MAX_COMPS_IN_SCAN]; /* last DC coef for each component */ +} savable_state; + +/* This macro is to work around compilers with missing or broken + * structure assignment. You'll need to fix this code if you have + * such a compiler and you change MAX_COMPS_IN_SCAN. + */ + +#ifndef NO_STRUCT_ASSIGN +#define ASSIGN_STATE(dest,src) ((dest) = (src)) +#else +#if MAX_COMPS_IN_SCAN == 4 +#define ASSIGN_STATE(dest,src) \ + ((dest).last_dc_val[0] = (src).last_dc_val[0], \ + (dest).last_dc_val[1] = (src).last_dc_val[1], \ + (dest).last_dc_val[2] = (src).last_dc_val[2], \ + (dest).last_dc_val[3] = (src).last_dc_val[3]) +#endif +#endif + + +typedef struct { + struct jpeg_entropy_decoder pub; /* public fields */ + + /* These fields are loaded into local variables at start of each MCU. + * In case of suspension, we exit WITHOUT updating them. + */ + bitread_perm_state bitstate; /* Bit buffer at start of MCU */ + savable_state saved; /* Other state at start of MCU */ + + /* These fields are NOT loaded into local working state. */ + unsigned int restarts_to_go; /* MCUs left in this restart interval */ + + /* Pointers to derived tables (these workspaces have image lifespan) */ + d_derived_tbl *dc_derived_tbls[NUM_HUFF_TBLS]; + d_derived_tbl *ac_derived_tbls[NUM_HUFF_TBLS]; + + /* Precalculated info set up by start_pass for use in decode_mcu: */ + + /* Pointers to derived tables to be used for each block within an MCU */ + d_derived_tbl *dc_cur_tbls[D_MAX_BLOCKS_IN_MCU]; + d_derived_tbl *ac_cur_tbls[D_MAX_BLOCKS_IN_MCU]; + /* Whether we care about the DC and AC coefficient values for each block */ + boolean dc_needed[D_MAX_BLOCKS_IN_MCU]; + boolean ac_needed[D_MAX_BLOCKS_IN_MCU]; +} huff_entropy_decoder; + +typedef huff_entropy_decoder *huff_entropy_ptr; + + +/* + * Initialize for a Huffman-compressed scan. + */ + +METHODDEF(void) +start_pass_huff_decoder (j_decompress_ptr cinfo) +{ + huff_entropy_ptr entropy = (huff_entropy_ptr) cinfo->entropy; + int ci, blkn, dctbl, actbl; + d_derived_tbl **pdtbl; + jpeg_component_info *compptr; + + /* Check that the scan parameters Ss, Se, Ah/Al are OK for sequential JPEG. + * This ought to be an error condition, but we make it a warning because + * there are some baseline files out there with all zeroes in these bytes. + */ + if (cinfo->Ss != 0 || cinfo->Se != DCTSIZE2-1 || + cinfo->Ah != 0 || cinfo->Al != 0) + WARNMS(cinfo, JWRN_NOT_SEQUENTIAL); + + for (ci = 0; ci < cinfo->comps_in_scan; ci++) { + compptr = cinfo->cur_comp_info[ci]; + dctbl = compptr->dc_tbl_no; + actbl = compptr->ac_tbl_no; + /* Compute derived values for Huffman tables */ + /* We may do this more than once for a table, but it's not expensive */ + pdtbl = (d_derived_tbl **)(entropy->dc_derived_tbls) + dctbl; + jpeg_make_d_derived_tbl(cinfo, TRUE, dctbl, pdtbl); + pdtbl = (d_derived_tbl **)(entropy->ac_derived_tbls) + actbl; + jpeg_make_d_derived_tbl(cinfo, FALSE, actbl, pdtbl); + /* Initialize DC predictions to 0 */ + entropy->saved.last_dc_val[ci] = 0; + } + + /* Precalculate decoding info for each block in an MCU of this scan */ + for (blkn = 0; blkn < cinfo->blocks_in_MCU; blkn++) { + ci = cinfo->MCU_membership[blkn]; + compptr = cinfo->cur_comp_info[ci]; + /* Precalculate which table to use for each block */ + entropy->dc_cur_tbls[blkn] = entropy->dc_derived_tbls[compptr->dc_tbl_no]; + entropy->ac_cur_tbls[blkn] = entropy->ac_derived_tbls[compptr->ac_tbl_no]; + /* Decide whether we really care about the coefficient values */ + if (compptr->component_needed) { + entropy->dc_needed[blkn] = TRUE; + /* we don't need the ACs if producing a 1/8th-size image */ + entropy->ac_needed[blkn] = (compptr->_DCT_scaled_size > 1); + } else { + entropy->dc_needed[blkn] = entropy->ac_needed[blkn] = FALSE; + } + } + + /* Initialize bitread state variables */ + entropy->bitstate.bits_left = 0; + entropy->bitstate.get_buffer = 0; /* unnecessary, but keeps Purify quiet */ + entropy->pub.insufficient_data = FALSE; + + /* Initialize restart counter */ + entropy->restarts_to_go = cinfo->restart_interval; +} + + +/* + * Compute the derived values for a Huffman table. + * This routine also performs some validation checks on the table. + * + * Note this is also used by jdphuff.c. + */ + +GLOBAL(void) +jpeg_make_d_derived_tbl (j_decompress_ptr cinfo, boolean isDC, int tblno, + d_derived_tbl **pdtbl) +{ + JHUFF_TBL *htbl; + d_derived_tbl *dtbl; + int p, i, l, si, numsymbols; + int lookbits, ctr; + char huffsize[257]; + unsigned int huffcode[257]; + unsigned int code; + + /* Note that huffsize[] and huffcode[] are filled in code-length order, + * paralleling the order of the symbols themselves in htbl->huffval[]. + */ + + /* Find the input Huffman table */ + if (tblno < 0 || tblno >= NUM_HUFF_TBLS) + ERREXIT1(cinfo, JERR_NO_HUFF_TABLE, tblno); + htbl = + isDC ? cinfo->dc_huff_tbl_ptrs[tblno] : cinfo->ac_huff_tbl_ptrs[tblno]; + if (htbl == NULL) + ERREXIT1(cinfo, JERR_NO_HUFF_TABLE, tblno); + + /* Allocate a workspace if we haven't already done so. */ + if (*pdtbl == NULL) + *pdtbl = (d_derived_tbl *) + (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE, + sizeof(d_derived_tbl)); + dtbl = *pdtbl; + dtbl->pub = htbl; /* fill in back link */ + + /* Figure C.1: make table of Huffman code length for each symbol */ + + p = 0; + for (l = 1; l <= 16; l++) { + i = (int) htbl->bits[l]; + if (i < 0 || p + i > 256) /* protect against table overrun */ + ERREXIT(cinfo, JERR_BAD_HUFF_TABLE); + while (i--) + huffsize[p++] = (char) l; + } + huffsize[p] = 0; + numsymbols = p; + + /* Figure C.2: generate the codes themselves */ + /* We also validate that the counts represent a legal Huffman code tree. */ + + code = 0; + si = huffsize[0]; + p = 0; + while (huffsize[p]) { + while (((int) huffsize[p]) == si) { + huffcode[p++] = code; + code++; + } + /* code is now 1 more than the last code used for codelength si; but + * it must still fit in si bits, since no code is allowed to be all ones. + */ + if (((JLONG) code) >= (((JLONG) 1) << si)) + ERREXIT(cinfo, JERR_BAD_HUFF_TABLE); + code <<= 1; + si++; + } + + /* Figure F.15: generate decoding tables for bit-sequential decoding */ + + p = 0; + for (l = 1; l <= 16; l++) { + if (htbl->bits[l]) { + /* valoffset[l] = huffval[] index of 1st symbol of code length l, + * minus the minimum code of length l + */ + dtbl->valoffset[l] = (JLONG) p - (JLONG) huffcode[p]; + p += htbl->bits[l]; + dtbl->maxcode[l] = huffcode[p-1]; /* maximum code of length l */ + } else { + dtbl->maxcode[l] = -1; /* -1 if no codes of this length */ + } + } + dtbl->valoffset[17] = 0; + dtbl->maxcode[17] = 0xFFFFFL; /* ensures jpeg_huff_decode terminates */ + + /* Compute lookahead tables to speed up decoding. + * First we set all the table entries to 0, indicating "too long"; + * then we iterate through the Huffman codes that are short enough and + * fill in all the entries that correspond to bit sequences starting + * with that code. + */ + + for (i = 0; i < (1 << HUFF_LOOKAHEAD); i++) + dtbl->lookup[i] = (HUFF_LOOKAHEAD + 1) << HUFF_LOOKAHEAD; + + p = 0; + for (l = 1; l <= HUFF_LOOKAHEAD; l++) { + for (i = 1; i <= (int) htbl->bits[l]; i++, p++) { + /* l = current code's length, p = its index in huffcode[] & huffval[]. */ + /* Generate left-justified code followed by all possible bit sequences */ + lookbits = huffcode[p] << (HUFF_LOOKAHEAD-l); + for (ctr = 1 << (HUFF_LOOKAHEAD-l); ctr > 0; ctr--) { + dtbl->lookup[lookbits] = (l << HUFF_LOOKAHEAD) | htbl->huffval[p]; + lookbits++; + } + } + } + + /* Validate symbols as being reasonable. + * For AC tables, we make no check, but accept all byte values 0..255. + * For DC tables, we require the symbols to be in range 0..15. + * (Tighter bounds could be applied depending on the data depth and mode, + * but this is sufficient to ensure safe decoding.) + */ + if (isDC) { + for (i = 0; i < numsymbols; i++) { + int sym = htbl->huffval[i]; + if (sym < 0 || sym > 15) + ERREXIT(cinfo, JERR_BAD_HUFF_TABLE); + } + } +} + + +/* + * Out-of-line code for bit fetching (shared with jdphuff.c). + * See jdhuff.h for info about usage. + * Note: current values of get_buffer and bits_left are passed as parameters, + * but are returned in the corresponding fields of the state struct. + * + * On most machines MIN_GET_BITS should be 25 to allow the full 32-bit width + * of get_buffer to be used. (On machines with wider words, an even larger + * buffer could be used.) However, on some machines 32-bit shifts are + * quite slow and take time proportional to the number of places shifted. + * (This is true with most PC compilers, for instance.) In this case it may + * be a win to set MIN_GET_BITS to the minimum value of 15. This reduces the + * average shift distance at the cost of more calls to jpeg_fill_bit_buffer. + */ + +#ifdef SLOW_SHIFT_32 +#define MIN_GET_BITS 15 /* minimum allowable value */ +#else +#define MIN_GET_BITS (BIT_BUF_SIZE-7) +#endif + + +GLOBAL(boolean) +jpeg_fill_bit_buffer (bitread_working_state *state, + register bit_buf_type get_buffer, register int bits_left, + int nbits) +/* Load up the bit buffer to a depth of at least nbits */ +{ + /* Copy heavily used state fields into locals (hopefully registers) */ + register const JOCTET *next_input_byte = state->next_input_byte; + register size_t bytes_in_buffer = state->bytes_in_buffer; + j_decompress_ptr cinfo = state->cinfo; + + /* Attempt to load at least MIN_GET_BITS bits into get_buffer. */ + /* (It is assumed that no request will be for more than that many bits.) */ + /* We fail to do so only if we hit a marker or are forced to suspend. */ + + if (cinfo->unread_marker == 0) { /* cannot advance past a marker */ + while (bits_left < MIN_GET_BITS) { + register int c; + + /* Attempt to read a byte */ + if (bytes_in_buffer == 0) { + if (! (*cinfo->src->fill_input_buffer) (cinfo)) + return FALSE; + next_input_byte = cinfo->src->next_input_byte; + bytes_in_buffer = cinfo->src->bytes_in_buffer; + } + bytes_in_buffer--; + c = GETJOCTET(*next_input_byte++); + + /* If it's 0xFF, check and discard stuffed zero byte */ + if (c == 0xFF) { + /* Loop here to discard any padding FF's on terminating marker, + * so that we can save a valid unread_marker value. NOTE: we will + * accept multiple FF's followed by a 0 as meaning a single FF data + * byte. This data pattern is not valid according to the standard. + */ + do { + if (bytes_in_buffer == 0) { + if (! (*cinfo->src->fill_input_buffer) (cinfo)) + return FALSE; + next_input_byte = cinfo->src->next_input_byte; + bytes_in_buffer = cinfo->src->bytes_in_buffer; + } + bytes_in_buffer--; + c = GETJOCTET(*next_input_byte++); + } while (c == 0xFF); + + if (c == 0) { + /* Found FF/00, which represents an FF data byte */ + c = 0xFF; + } else { + /* Oops, it's actually a marker indicating end of compressed data. + * Save the marker code for later use. + * Fine point: it might appear that we should save the marker into + * bitread working state, not straight into permanent state. But + * once we have hit a marker, we cannot need to suspend within the + * current MCU, because we will read no more bytes from the data + * source. So it is OK to update permanent state right away. + */ + cinfo->unread_marker = c; + /* See if we need to insert some fake zero bits. */ + goto no_more_bytes; + } + } + + /* OK, load c into get_buffer */ + get_buffer = (get_buffer << 8) | c; + bits_left += 8; + } /* end while */ + } else { + no_more_bytes: + /* We get here if we've read the marker that terminates the compressed + * data segment. There should be enough bits in the buffer register + * to satisfy the request; if so, no problem. + */ + if (nbits > bits_left) { + /* Uh-oh. Report corrupted data to user and stuff zeroes into + * the data stream, so that we can produce some kind of image. + * We use a nonvolatile flag to ensure that only one warning message + * appears per data segment. + */ + if (! cinfo->entropy->insufficient_data) { + WARNMS(cinfo, JWRN_HIT_MARKER); + cinfo->entropy->insufficient_data = TRUE; + } + /* Fill the buffer with zero bits */ + get_buffer <<= MIN_GET_BITS - bits_left; + bits_left = MIN_GET_BITS; + } + } + + /* Unload the local registers */ + state->next_input_byte = next_input_byte; + state->bytes_in_buffer = bytes_in_buffer; + state->get_buffer = get_buffer; + state->bits_left = bits_left; + + return TRUE; +} + + +/* Macro version of the above, which performs much better but does not + handle markers. We have to hand off any blocks with markers to the + slower routines. */ + +#define GET_BYTE \ +{ \ + register int c0, c1; \ + c0 = GETJOCTET(*buffer++); \ + c1 = GETJOCTET(*buffer); \ + /* Pre-execute most common case */ \ + get_buffer = (get_buffer << 8) | c0; \ + bits_left += 8; \ + if (c0 == 0xFF) { \ + /* Pre-execute case of FF/00, which represents an FF data byte */ \ + buffer++; \ + if (c1 != 0) { \ + /* Oops, it's actually a marker indicating end of compressed data. */ \ + cinfo->unread_marker = c1; \ + /* Back out pre-execution and fill the buffer with zero bits */ \ + buffer -= 2; \ + get_buffer &= ~0xFF; \ + } \ + } \ +} + +#if SIZEOF_SIZE_T==8 || defined(_WIN64) + +/* Pre-fetch 48 bytes, because the holding register is 64-bit */ +#define FILL_BIT_BUFFER_FAST \ + if (bits_left <= 16) { \ + GET_BYTE GET_BYTE GET_BYTE GET_BYTE GET_BYTE GET_BYTE \ + } + +#else + +/* Pre-fetch 16 bytes, because the holding register is 32-bit */ +#define FILL_BIT_BUFFER_FAST \ + if (bits_left <= 16) { \ + GET_BYTE GET_BYTE \ + } + +#endif + + +/* + * Out-of-line code for Huffman code decoding. + * See jdhuff.h for info about usage. + */ + +GLOBAL(int) +jpeg_huff_decode (bitread_working_state *state, + register bit_buf_type get_buffer, register int bits_left, + d_derived_tbl *htbl, int min_bits) +{ + register int l = min_bits; + register JLONG code; + + /* HUFF_DECODE has determined that the code is at least min_bits */ + /* bits long, so fetch that many bits in one swoop. */ + + CHECK_BIT_BUFFER(*state, l, return -1); + code = GET_BITS(l); + + /* Collect the rest of the Huffman code one bit at a time. */ + /* This is per Figure F.16 in the JPEG spec. */ + + while (code > htbl->maxcode[l]) { + code <<= 1; + CHECK_BIT_BUFFER(*state, 1, return -1); + code |= GET_BITS(1); + l++; + } + + /* Unload the local registers */ + state->get_buffer = get_buffer; + state->bits_left = bits_left; + + /* With garbage input we may reach the sentinel value l = 17. */ + + if (l > 16) { + WARNMS(state->cinfo, JWRN_HUFF_BAD_CODE); + return 0; /* fake a zero as the safest result */ + } + + return htbl->pub->huffval[ (int) (code + htbl->valoffset[l]) ]; +} + + +/* + * Figure F.12: extend sign bit. + * On some machines, a shift and add will be faster than a table lookup. + */ + +#define AVOID_TABLES +#ifdef AVOID_TABLES + +#define NEG_1 ((unsigned int)-1) +#define HUFF_EXTEND(x,s) ((x) + ((((x) - (1<<((s)-1))) >> 31) & (((NEG_1)<<(s)) + 1))) + +#else + +#define HUFF_EXTEND(x,s) ((x) < extend_test[s] ? (x) + extend_offset[s] : (x)) + +static const int extend_test[16] = /* entry n is 2**(n-1) */ + { 0, 0x0001, 0x0002, 0x0004, 0x0008, 0x0010, 0x0020, 0x0040, 0x0080, + 0x0100, 0x0200, 0x0400, 0x0800, 0x1000, 0x2000, 0x4000 }; + +static const int extend_offset[16] = /* entry n is (-1 << n) + 1 */ + { 0, ((-1)<<1) + 1, ((-1)<<2) + 1, ((-1)<<3) + 1, ((-1)<<4) + 1, + ((-1)<<5) + 1, ((-1)<<6) + 1, ((-1)<<7) + 1, ((-1)<<8) + 1, + ((-1)<<9) + 1, ((-1)<<10) + 1, ((-1)<<11) + 1, ((-1)<<12) + 1, + ((-1)<<13) + 1, ((-1)<<14) + 1, ((-1)<<15) + 1 }; + +#endif /* AVOID_TABLES */ + + +/* + * Check for a restart marker & resynchronize decoder. + * Returns FALSE if must suspend. + */ + +LOCAL(boolean) +process_restart (j_decompress_ptr cinfo) +{ + huff_entropy_ptr entropy = (huff_entropy_ptr) cinfo->entropy; + int ci; + + /* Throw away any unused bits remaining in bit buffer; */ + /* include any full bytes in next_marker's count of discarded bytes */ + cinfo->marker->discarded_bytes += entropy->bitstate.bits_left / 8; + entropy->bitstate.bits_left = 0; + + /* Advance past the RSTn marker */ + if (! (*cinfo->marker->read_restart_marker) (cinfo)) + return FALSE; + + /* Re-initialize DC predictions to 0 */ + for (ci = 0; ci < cinfo->comps_in_scan; ci++) + entropy->saved.last_dc_val[ci] = 0; + + /* Reset restart counter */ + entropy->restarts_to_go = cinfo->restart_interval; + + /* Reset out-of-data flag, unless read_restart_marker left us smack up + * against a marker. In that case we will end up treating the next data + * segment as empty, and we can avoid producing bogus output pixels by + * leaving the flag set. + */ + if (cinfo->unread_marker == 0) + entropy->pub.insufficient_data = FALSE; + + return TRUE; +} + + +LOCAL(boolean) +decode_mcu_slow (j_decompress_ptr cinfo, JBLOCKROW *MCU_data) +{ + huff_entropy_ptr entropy = (huff_entropy_ptr) cinfo->entropy; + BITREAD_STATE_VARS; + int blkn; + savable_state state; + /* Outer loop handles each block in the MCU */ + + /* Load up working state */ + BITREAD_LOAD_STATE(cinfo,entropy->bitstate); + ASSIGN_STATE(state, entropy->saved); + + for (blkn = 0; blkn < cinfo->blocks_in_MCU; blkn++) { + JBLOCKROW block = MCU_data ? MCU_data[blkn] : NULL; + d_derived_tbl *dctbl = entropy->dc_cur_tbls[blkn]; + d_derived_tbl *actbl = entropy->ac_cur_tbls[blkn]; + register int s, k, r; + + /* Decode a single block's worth of coefficients */ + + /* Section F.2.2.1: decode the DC coefficient difference */ + HUFF_DECODE(s, br_state, dctbl, return FALSE, label1); + if (s) { + CHECK_BIT_BUFFER(br_state, s, return FALSE); + r = GET_BITS(s); + s = HUFF_EXTEND(r, s); + } + + if (entropy->dc_needed[blkn]) { + /* Convert DC difference to actual value, update last_dc_val */ + int ci = cinfo->MCU_membership[blkn]; + s += state.last_dc_val[ci]; + state.last_dc_val[ci] = s; + if (block) { + /* Output the DC coefficient (assumes jpeg_natural_order[0] = 0) */ + (*block)[0] = (JCOEF) s; + } + } + + if (entropy->ac_needed[blkn] && block) { + + /* Section F.2.2.2: decode the AC coefficients */ + /* Since zeroes are skipped, output area must be cleared beforehand */ + for (k = 1; k < DCTSIZE2; k++) { + HUFF_DECODE(s, br_state, actbl, return FALSE, label2); + + r = s >> 4; + s &= 15; + + if (s) { + k += r; + CHECK_BIT_BUFFER(br_state, s, return FALSE); + r = GET_BITS(s); + s = HUFF_EXTEND(r, s); + /* Output coefficient in natural (dezigzagged) order. + * Note: the extra entries in jpeg_natural_order[] will save us + * if k >= DCTSIZE2, which could happen if the data is corrupted. + */ + (*block)[jpeg_natural_order[k]] = (JCOEF) s; + } else { + if (r != 15) + break; + k += 15; + } + } + + } else { + + /* Section F.2.2.2: decode the AC coefficients */ + /* In this path we just discard the values */ + for (k = 1; k < DCTSIZE2; k++) { + HUFF_DECODE(s, br_state, actbl, return FALSE, label3); + + r = s >> 4; + s &= 15; + + if (s) { + k += r; + CHECK_BIT_BUFFER(br_state, s, return FALSE); + DROP_BITS(s); + } else { + if (r != 15) + break; + k += 15; + } + } + } + } + + /* Completed MCU, so update state */ + BITREAD_SAVE_STATE(cinfo,entropy->bitstate); + ASSIGN_STATE(entropy->saved, state); + return TRUE; +} + + +LOCAL(boolean) +decode_mcu_fast (j_decompress_ptr cinfo, JBLOCKROW *MCU_data) +{ + huff_entropy_ptr entropy = (huff_entropy_ptr) cinfo->entropy; + BITREAD_STATE_VARS; + JOCTET *buffer; + int blkn; + savable_state state; + /* Outer loop handles each block in the MCU */ + + /* Load up working state */ + BITREAD_LOAD_STATE(cinfo,entropy->bitstate); + buffer = (JOCTET *) br_state.next_input_byte; + ASSIGN_STATE(state, entropy->saved); + + for (blkn = 0; blkn < cinfo->blocks_in_MCU; blkn++) { + JBLOCKROW block = MCU_data ? MCU_data[blkn] : NULL; + d_derived_tbl *dctbl = entropy->dc_cur_tbls[blkn]; + d_derived_tbl *actbl = entropy->ac_cur_tbls[blkn]; + register int s, k, r, l; + + HUFF_DECODE_FAST(s, l, dctbl, slow_decode_mcu); + if (s) { + FILL_BIT_BUFFER_FAST + r = GET_BITS(s); + s = HUFF_EXTEND(r, s); + } + + if (entropy->dc_needed[blkn]) { + int ci = cinfo->MCU_membership[blkn]; + s += state.last_dc_val[ci]; + state.last_dc_val[ci] = s; + if (block) + (*block)[0] = (JCOEF) s; + } + + if (entropy->ac_needed[blkn] && block) { + + for (k = 1; k < DCTSIZE2; k++) { + HUFF_DECODE_FAST(s, l, actbl, slow_decode_mcu); + r = s >> 4; + s &= 15; + + if (s) { + k += r; + FILL_BIT_BUFFER_FAST + r = GET_BITS(s); + s = HUFF_EXTEND(r, s); + (*block)[jpeg_natural_order[k]] = (JCOEF) s; + } else { + if (r != 15) break; + k += 15; + } + } + + } else { + + for (k = 1; k < DCTSIZE2; k++) { + HUFF_DECODE_FAST(s, l, actbl, slow_decode_mcu); + r = s >> 4; + s &= 15; + + if (s) { + k += r; + FILL_BIT_BUFFER_FAST + DROP_BITS(s); + } else { + if (r != 15) break; + k += 15; + } + } + } + } + + if (cinfo->unread_marker != 0) { +slow_decode_mcu: + cinfo->unread_marker = 0; + return FALSE; + } + + br_state.bytes_in_buffer -= (buffer - br_state.next_input_byte); + br_state.next_input_byte = buffer; + BITREAD_SAVE_STATE(cinfo,entropy->bitstate); + ASSIGN_STATE(entropy->saved, state); + return TRUE; +} + + +/* + * Decode and return one MCU's worth of Huffman-compressed coefficients. + * The coefficients are reordered from zigzag order into natural array order, + * but are not dequantized. + * + * The i'th block of the MCU is stored into the block pointed to by + * MCU_data[i]. WE ASSUME THIS AREA HAS BEEN ZEROED BY THE CALLER. + * (Wholesale zeroing is usually a little faster than retail...) + * + * Returns FALSE if data source requested suspension. In that case no + * changes have been made to permanent state. (Exception: some output + * coefficients may already have been assigned. This is harmless for + * this module, since we'll just re-assign them on the next call.) + */ + +#define BUFSIZE (DCTSIZE2 * 8) + +METHODDEF(boolean) +decode_mcu (j_decompress_ptr cinfo, JBLOCKROW *MCU_data) +{ + huff_entropy_ptr entropy = (huff_entropy_ptr) cinfo->entropy; + int usefast = 1; + + /* Process restart marker if needed; may have to suspend */ + if (cinfo->restart_interval) { + if (entropy->restarts_to_go == 0) + if (! process_restart(cinfo)) + return FALSE; + usefast = 0; + } + + if (cinfo->src->bytes_in_buffer < BUFSIZE * (size_t)cinfo->blocks_in_MCU + || cinfo->unread_marker != 0) + usefast = 0; + + /* If we've run out of data, just leave the MCU set to zeroes. + * This way, we return uniform gray for the remainder of the segment. + */ + if (! entropy->pub.insufficient_data) { + + if (usefast) { + if (!decode_mcu_fast(cinfo, MCU_data)) goto use_slow; + } + else { + use_slow: + if (!decode_mcu_slow(cinfo, MCU_data)) return FALSE; + } + + } + + /* Account for restart interval (no-op if not using restarts) */ + entropy->restarts_to_go--; + + return TRUE; +} + + +/* + * Module initialization routine for Huffman entropy decoding. + */ + +GLOBAL(void) +jinit_huff_decoder (j_decompress_ptr cinfo) +{ + huff_entropy_ptr entropy; + int i; + + /* Motion JPEG frames typically do not include the Huffman tables if they + are the default tables. Thus, if the tables are not set by the time + the Huffman decoder is initialized (usually within the body of + jpeg_start_decompress()), we set them to default values. */ + std_huff_tables((j_common_ptr) cinfo); + + entropy = (huff_entropy_ptr) + (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE, + sizeof(huff_entropy_decoder)); + cinfo->entropy = (struct jpeg_entropy_decoder *) entropy; + entropy->pub.start_pass = start_pass_huff_decoder; + entropy->pub.decode_mcu = decode_mcu; + + /* Mark tables unallocated */ + for (i = 0; i < NUM_HUFF_TBLS; i++) { + entropy->dc_derived_tbls[i] = entropy->ac_derived_tbls[i] = NULL; + } +} diff --git a/media/libjpeg/jdhuff.h b/media/libjpeg/jdhuff.h new file mode 100644 index 000000000..3f15d71a8 --- /dev/null +++ b/media/libjpeg/jdhuff.h @@ -0,0 +1,236 @@ +/* + * jdhuff.h + * + * This file was part of the Independent JPEG Group's software: + * Copyright (C) 1991-1997, Thomas G. Lane. + * libjpeg-turbo Modifications: + * Copyright (C) 2010-2011, 2015-2016, D. R. Commander. + * For conditions of distribution and use, see the accompanying README.ijg + * file. + * + * This file contains declarations for Huffman entropy decoding routines + * that are shared between the sequential decoder (jdhuff.c) and the + * progressive decoder (jdphuff.c). No other modules need to see these. + */ + +#include "jconfigint.h" + + +/* Derived data constructed for each Huffman table */ + +#define HUFF_LOOKAHEAD 8 /* # of bits of lookahead */ + +typedef struct { + /* Basic tables: (element [0] of each array is unused) */ + JLONG maxcode[18]; /* largest code of length k (-1 if none) */ + /* (maxcode[17] is a sentinel to ensure jpeg_huff_decode terminates) */ + JLONG valoffset[18]; /* huffval[] offset for codes of length k */ + /* valoffset[k] = huffval[] index of 1st symbol of code length k, less + * the smallest code of length k; so given a code of length k, the + * corresponding symbol is huffval[code + valoffset[k]] + */ + + /* Link to public Huffman table (needed only in jpeg_huff_decode) */ + JHUFF_TBL *pub; + + /* Lookahead table: indexed by the next HUFF_LOOKAHEAD bits of + * the input data stream. If the next Huffman code is no more + * than HUFF_LOOKAHEAD bits long, we can obtain its length and + * the corresponding symbol directly from this tables. + * + * The lower 8 bits of each table entry contain the number of + * bits in the corresponding Huffman code, or HUFF_LOOKAHEAD + 1 + * if too long. The next 8 bits of each entry contain the + * symbol. + */ + int lookup[1<<HUFF_LOOKAHEAD]; +} d_derived_tbl; + +/* Expand a Huffman table definition into the derived format */ +EXTERN(void) jpeg_make_d_derived_tbl + (j_decompress_ptr cinfo, boolean isDC, int tblno, + d_derived_tbl ** pdtbl); + + +/* + * Fetching the next N bits from the input stream is a time-critical operation + * for the Huffman decoders. We implement it with a combination of inline + * macros and out-of-line subroutines. Note that N (the number of bits + * demanded at one time) never exceeds 15 for JPEG use. + * + * We read source bytes into get_buffer and dole out bits as needed. + * If get_buffer already contains enough bits, they are fetched in-line + * by the macros CHECK_BIT_BUFFER and GET_BITS. When there aren't enough + * bits, jpeg_fill_bit_buffer is called; it will attempt to fill get_buffer + * as full as possible (not just to the number of bits needed; this + * prefetching reduces the overhead cost of calling jpeg_fill_bit_buffer). + * Note that jpeg_fill_bit_buffer may return FALSE to indicate suspension. + * On TRUE return, jpeg_fill_bit_buffer guarantees that get_buffer contains + * at least the requested number of bits --- dummy zeroes are inserted if + * necessary. + */ + +#if !defined(_WIN32) && !defined(SIZEOF_SIZE_T) +#error Cannot determine word size +#endif + +#if SIZEOF_SIZE_T==8 || defined(_WIN64) + +typedef size_t bit_buf_type; /* type of bit-extraction buffer */ +#define BIT_BUF_SIZE 64 /* size of buffer in bits */ + +#else + +typedef unsigned long bit_buf_type; /* type of bit-extraction buffer */ +#define BIT_BUF_SIZE 32 /* size of buffer in bits */ + +#endif + +/* If long is > 32 bits on your machine, and shifting/masking longs is + * reasonably fast, making bit_buf_type be long and setting BIT_BUF_SIZE + * appropriately should be a win. Unfortunately we can't define the size + * with something like #define BIT_BUF_SIZE (sizeof(bit_buf_type)*8) + * because not all machines measure sizeof in 8-bit bytes. + */ + +typedef struct { /* Bitreading state saved across MCUs */ + bit_buf_type get_buffer; /* current bit-extraction buffer */ + int bits_left; /* # of unused bits in it */ +} bitread_perm_state; + +typedef struct { /* Bitreading working state within an MCU */ + /* Current data source location */ + /* We need a copy, rather than munging the original, in case of suspension */ + const JOCTET *next_input_byte; /* => next byte to read from source */ + size_t bytes_in_buffer; /* # of bytes remaining in source buffer */ + /* Bit input buffer --- note these values are kept in register variables, + * not in this struct, inside the inner loops. + */ + bit_buf_type get_buffer; /* current bit-extraction buffer */ + int bits_left; /* # of unused bits in it */ + /* Pointer needed by jpeg_fill_bit_buffer. */ + j_decompress_ptr cinfo; /* back link to decompress master record */ +} bitread_working_state; + +/* Macros to declare and load/save bitread local variables. */ +#define BITREAD_STATE_VARS \ + register bit_buf_type get_buffer; \ + register int bits_left; \ + bitread_working_state br_state + +#define BITREAD_LOAD_STATE(cinfop,permstate) \ + br_state.cinfo = cinfop; \ + br_state.next_input_byte = cinfop->src->next_input_byte; \ + br_state.bytes_in_buffer = cinfop->src->bytes_in_buffer; \ + get_buffer = permstate.get_buffer; \ + bits_left = permstate.bits_left; + +#define BITREAD_SAVE_STATE(cinfop,permstate) \ + cinfop->src->next_input_byte = br_state.next_input_byte; \ + cinfop->src->bytes_in_buffer = br_state.bytes_in_buffer; \ + permstate.get_buffer = get_buffer; \ + permstate.bits_left = bits_left + +/* + * These macros provide the in-line portion of bit fetching. + * Use CHECK_BIT_BUFFER to ensure there are N bits in get_buffer + * before using GET_BITS, PEEK_BITS, or DROP_BITS. + * The variables get_buffer and bits_left are assumed to be locals, + * but the state struct might not be (jpeg_huff_decode needs this). + * CHECK_BIT_BUFFER(state,n,action); + * Ensure there are N bits in get_buffer; if suspend, take action. + * val = GET_BITS(n); + * Fetch next N bits. + * val = PEEK_BITS(n); + * Fetch next N bits without removing them from the buffer. + * DROP_BITS(n); + * Discard next N bits. + * The value N should be a simple variable, not an expression, because it + * is evaluated multiple times. + */ + +#define CHECK_BIT_BUFFER(state,nbits,action) \ + { if (bits_left < (nbits)) { \ + if (! jpeg_fill_bit_buffer(&(state),get_buffer,bits_left,nbits)) \ + { action; } \ + get_buffer = (state).get_buffer; bits_left = (state).bits_left; } } + +#define GET_BITS(nbits) \ + (((int) (get_buffer >> (bits_left -= (nbits)))) & ((1<<(nbits))-1)) + +#define PEEK_BITS(nbits) \ + (((int) (get_buffer >> (bits_left - (nbits)))) & ((1<<(nbits))-1)) + +#define DROP_BITS(nbits) \ + (bits_left -= (nbits)) + +/* Load up the bit buffer to a depth of at least nbits */ +EXTERN(boolean) jpeg_fill_bit_buffer + (bitread_working_state *state, register bit_buf_type get_buffer, + register int bits_left, int nbits); + + +/* + * Code for extracting next Huffman-coded symbol from input bit stream. + * Again, this is time-critical and we make the main paths be macros. + * + * We use a lookahead table to process codes of up to HUFF_LOOKAHEAD bits + * without looping. Usually, more than 95% of the Huffman codes will be 8 + * or fewer bits long. The few overlength codes are handled with a loop, + * which need not be inline code. + * + * Notes about the HUFF_DECODE macro: + * 1. Near the end of the data segment, we may fail to get enough bits + * for a lookahead. In that case, we do it the hard way. + * 2. If the lookahead table contains no entry, the next code must be + * more than HUFF_LOOKAHEAD bits long. + * 3. jpeg_huff_decode returns -1 if forced to suspend. + */ + +#define HUFF_DECODE(result,state,htbl,failaction,slowlabel) \ +{ register int nb, look; \ + if (bits_left < HUFF_LOOKAHEAD) { \ + if (! jpeg_fill_bit_buffer(&state,get_buffer,bits_left, 0)) {failaction;} \ + get_buffer = state.get_buffer; bits_left = state.bits_left; \ + if (bits_left < HUFF_LOOKAHEAD) { \ + nb = 1; goto slowlabel; \ + } \ + } \ + look = PEEK_BITS(HUFF_LOOKAHEAD); \ + if ((nb = (htbl->lookup[look] >> HUFF_LOOKAHEAD)) <= HUFF_LOOKAHEAD) { \ + DROP_BITS(nb); \ + result = htbl->lookup[look] & ((1 << HUFF_LOOKAHEAD) - 1); \ + } else { \ +slowlabel: \ + if ((result=jpeg_huff_decode(&state,get_buffer,bits_left,htbl,nb)) < 0) \ + { failaction; } \ + get_buffer = state.get_buffer; bits_left = state.bits_left; \ + } \ +} + +#define HUFF_DECODE_FAST(s,nb,htbl,slowlabel) \ + FILL_BIT_BUFFER_FAST; \ + s = PEEK_BITS(HUFF_LOOKAHEAD); \ + s = htbl->lookup[s]; \ + nb = s >> HUFF_LOOKAHEAD; \ + /* Pre-execute the common case of nb <= HUFF_LOOKAHEAD */ \ + DROP_BITS(nb); \ + s = s & ((1 << HUFF_LOOKAHEAD) - 1); \ + if (nb > HUFF_LOOKAHEAD) { \ + /* Equivalent of jpeg_huff_decode() */ \ + /* Don't use GET_BITS() here because we don't want to modify bits_left */ \ + s = (get_buffer >> bits_left) & ((1 << (nb)) - 1); \ + while (s > htbl->maxcode[nb]) { \ + s <<= 1; \ + s |= GET_BITS(1); \ + nb++; \ + } \ + if (nb > 16) \ + goto slowlabel; \ + s = htbl->pub->huffval[ (int) (s + htbl->valoffset[nb]) ]; \ + } + +/* Out-of-line case for Huffman code fetching */ +EXTERN(int) jpeg_huff_decode + (bitread_working_state *state, register bit_buf_type get_buffer, + register int bits_left, d_derived_tbl *htbl, int min_bits); diff --git a/media/libjpeg/jdinput.c b/media/libjpeg/jdinput.c new file mode 100644 index 000000000..32a6b424e --- /dev/null +++ b/media/libjpeg/jdinput.c @@ -0,0 +1,405 @@ +/* + * jdinput.c + * + * This file was part of the Independent JPEG Group's software: + * Copyright (C) 1991-1997, Thomas G. Lane. + * libjpeg-turbo Modifications: + * Copyright (C) 2010, 2016, D. R. Commander. + * Copyright (C) 2015, Google, Inc. + * For conditions of distribution and use, see the accompanying README.ijg + * file. + * + * This file contains input control logic for the JPEG decompressor. + * These routines are concerned with controlling the decompressor's input + * processing (marker reading and coefficient decoding). The actual input + * reading is done in jdmarker.c, jdhuff.c, and jdphuff.c. + */ + +#define JPEG_INTERNALS +#include "jinclude.h" +#include "jpeglib.h" +#include "jpegcomp.h" + + +/* Private state */ + +typedef struct { + struct jpeg_input_controller pub; /* public fields */ + + boolean inheaders; /* TRUE until first SOS is reached */ +} my_input_controller; + +typedef my_input_controller *my_inputctl_ptr; + + +/* Forward declarations */ +METHODDEF(int) consume_markers (j_decompress_ptr cinfo); + + +/* + * Routines to calculate various quantities related to the size of the image. + */ + +LOCAL(void) +initial_setup (j_decompress_ptr cinfo) +/* Called once, when first SOS marker is reached */ +{ + int ci; + jpeg_component_info *compptr; + + /* Make sure image isn't bigger than I can handle */ + if ((long) cinfo->image_height > (long) JPEG_MAX_DIMENSION || + (long) cinfo->image_width > (long) JPEG_MAX_DIMENSION) + ERREXIT1(cinfo, JERR_IMAGE_TOO_BIG, (unsigned int) JPEG_MAX_DIMENSION); + + /* For now, precision must match compiled-in value... */ + if (cinfo->data_precision != BITS_IN_JSAMPLE) + ERREXIT1(cinfo, JERR_BAD_PRECISION, cinfo->data_precision); + + /* Check that number of components won't exceed internal array sizes */ + if (cinfo->num_components > MAX_COMPONENTS) + ERREXIT2(cinfo, JERR_COMPONENT_COUNT, cinfo->num_components, + MAX_COMPONENTS); + + /* Compute maximum sampling factors; check factor validity */ + cinfo->max_h_samp_factor = 1; + cinfo->max_v_samp_factor = 1; + for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components; + ci++, compptr++) { + if (compptr->h_samp_factor<=0 || compptr->h_samp_factor>MAX_SAMP_FACTOR || + compptr->v_samp_factor<=0 || compptr->v_samp_factor>MAX_SAMP_FACTOR) + ERREXIT(cinfo, JERR_BAD_SAMPLING); + cinfo->max_h_samp_factor = MAX(cinfo->max_h_samp_factor, + compptr->h_samp_factor); + cinfo->max_v_samp_factor = MAX(cinfo->max_v_samp_factor, + compptr->v_samp_factor); + } + +#if JPEG_LIB_VERSION >=80 + cinfo->block_size = DCTSIZE; + cinfo->natural_order = jpeg_natural_order; + cinfo->lim_Se = DCTSIZE2-1; +#endif + + /* We initialize DCT_scaled_size and min_DCT_scaled_size to DCTSIZE. + * In the full decompressor, this will be overridden by jdmaster.c; + * but in the transcoder, jdmaster.c is not used, so we must do it here. + */ +#if JPEG_LIB_VERSION >= 70 + cinfo->min_DCT_h_scaled_size = cinfo->min_DCT_v_scaled_size = DCTSIZE; +#else + cinfo->min_DCT_scaled_size = DCTSIZE; +#endif + + /* Compute dimensions of components */ + for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components; + ci++, compptr++) { +#if JPEG_LIB_VERSION >= 70 + compptr->DCT_h_scaled_size = compptr->DCT_v_scaled_size = DCTSIZE; +#else + compptr->DCT_scaled_size = DCTSIZE; +#endif + /* Size in DCT blocks */ + compptr->width_in_blocks = (JDIMENSION) + jdiv_round_up((long) cinfo->image_width * (long) compptr->h_samp_factor, + (long) (cinfo->max_h_samp_factor * DCTSIZE)); + compptr->height_in_blocks = (JDIMENSION) + jdiv_round_up((long) cinfo->image_height * (long) compptr->v_samp_factor, + (long) (cinfo->max_v_samp_factor * DCTSIZE)); + /* Set the first and last MCU columns to decompress from multi-scan images. + * By default, decompress all of the MCU columns. + */ + cinfo->master->first_MCU_col[ci] = 0; + cinfo->master->last_MCU_col[ci] = compptr->width_in_blocks - 1; + /* downsampled_width and downsampled_height will also be overridden by + * jdmaster.c if we are doing full decompression. The transcoder library + * doesn't use these values, but the calling application might. + */ + /* Size in samples */ + compptr->downsampled_width = (JDIMENSION) + jdiv_round_up((long) cinfo->image_width * (long) compptr->h_samp_factor, + (long) cinfo->max_h_samp_factor); + compptr->downsampled_height = (JDIMENSION) + jdiv_round_up((long) cinfo->image_height * (long) compptr->v_samp_factor, + (long) cinfo->max_v_samp_factor); + /* Mark component needed, until color conversion says otherwise */ + compptr->component_needed = TRUE; + /* Mark no quantization table yet saved for component */ + compptr->quant_table = NULL; + } + + /* Compute number of fully interleaved MCU rows. */ + cinfo->total_iMCU_rows = (JDIMENSION) + jdiv_round_up((long) cinfo->image_height, + (long) (cinfo->max_v_samp_factor*DCTSIZE)); + + /* Decide whether file contains multiple scans */ + if (cinfo->comps_in_scan < cinfo->num_components || cinfo->progressive_mode) + cinfo->inputctl->has_multiple_scans = TRUE; + else + cinfo->inputctl->has_multiple_scans = FALSE; +} + + +LOCAL(void) +per_scan_setup (j_decompress_ptr cinfo) +/* Do computations that are needed before processing a JPEG scan */ +/* cinfo->comps_in_scan and cinfo->cur_comp_info[] were set from SOS marker */ +{ + int ci, mcublks, tmp; + jpeg_component_info *compptr; + + if (cinfo->comps_in_scan == 1) { + + /* Noninterleaved (single-component) scan */ + compptr = cinfo->cur_comp_info[0]; + + /* Overall image size in MCUs */ + cinfo->MCUs_per_row = compptr->width_in_blocks; + cinfo->MCU_rows_in_scan = compptr->height_in_blocks; + + /* For noninterleaved scan, always one block per MCU */ + compptr->MCU_width = 1; + compptr->MCU_height = 1; + compptr->MCU_blocks = 1; + compptr->MCU_sample_width = compptr->_DCT_scaled_size; + compptr->last_col_width = 1; + /* For noninterleaved scans, it is convenient to define last_row_height + * as the number of block rows present in the last iMCU row. + */ + tmp = (int) (compptr->height_in_blocks % compptr->v_samp_factor); + if (tmp == 0) tmp = compptr->v_samp_factor; + compptr->last_row_height = tmp; + + /* Prepare array describing MCU composition */ + cinfo->blocks_in_MCU = 1; + cinfo->MCU_membership[0] = 0; + + } else { + + /* Interleaved (multi-component) scan */ + if (cinfo->comps_in_scan <= 0 || cinfo->comps_in_scan > MAX_COMPS_IN_SCAN) + ERREXIT2(cinfo, JERR_COMPONENT_COUNT, cinfo->comps_in_scan, + MAX_COMPS_IN_SCAN); + + /* Overall image size in MCUs */ + cinfo->MCUs_per_row = (JDIMENSION) + jdiv_round_up((long) cinfo->image_width, + (long) (cinfo->max_h_samp_factor*DCTSIZE)); + cinfo->MCU_rows_in_scan = (JDIMENSION) + jdiv_round_up((long) cinfo->image_height, + (long) (cinfo->max_v_samp_factor*DCTSIZE)); + + cinfo->blocks_in_MCU = 0; + + for (ci = 0; ci < cinfo->comps_in_scan; ci++) { + compptr = cinfo->cur_comp_info[ci]; + /* Sampling factors give # of blocks of component in each MCU */ + compptr->MCU_width = compptr->h_samp_factor; + compptr->MCU_height = compptr->v_samp_factor; + compptr->MCU_blocks = compptr->MCU_width * compptr->MCU_height; + compptr->MCU_sample_width = compptr->MCU_width * compptr->_DCT_scaled_size; + /* Figure number of non-dummy blocks in last MCU column & row */ + tmp = (int) (compptr->width_in_blocks % compptr->MCU_width); + if (tmp == 0) tmp = compptr->MCU_width; + compptr->last_col_width = tmp; + tmp = (int) (compptr->height_in_blocks % compptr->MCU_height); + if (tmp == 0) tmp = compptr->MCU_height; + compptr->last_row_height = tmp; + /* Prepare array describing MCU composition */ + mcublks = compptr->MCU_blocks; + if (cinfo->blocks_in_MCU + mcublks > D_MAX_BLOCKS_IN_MCU) + ERREXIT(cinfo, JERR_BAD_MCU_SIZE); + while (mcublks-- > 0) { + cinfo->MCU_membership[cinfo->blocks_in_MCU++] = ci; + } + } + + } +} + + +/* + * Save away a copy of the Q-table referenced by each component present + * in the current scan, unless already saved during a prior scan. + * + * In a multiple-scan JPEG file, the encoder could assign different components + * the same Q-table slot number, but change table definitions between scans + * so that each component uses a different Q-table. (The IJG encoder is not + * currently capable of doing this, but other encoders might.) Since we want + * to be able to dequantize all the components at the end of the file, this + * means that we have to save away the table actually used for each component. + * We do this by copying the table at the start of the first scan containing + * the component. + * The JPEG spec prohibits the encoder from changing the contents of a Q-table + * slot between scans of a component using that slot. If the encoder does so + * anyway, this decoder will simply use the Q-table values that were current + * at the start of the first scan for the component. + * + * The decompressor output side looks only at the saved quant tables, + * not at the current Q-table slots. + */ + +LOCAL(void) +latch_quant_tables (j_decompress_ptr cinfo) +{ + int ci, qtblno; + jpeg_component_info *compptr; + JQUANT_TBL *qtbl; + + for (ci = 0; ci < cinfo->comps_in_scan; ci++) { + compptr = cinfo->cur_comp_info[ci]; + /* No work if we already saved Q-table for this component */ + if (compptr->quant_table != NULL) + continue; + /* Make sure specified quantization table is present */ + qtblno = compptr->quant_tbl_no; + if (qtblno < 0 || qtblno >= NUM_QUANT_TBLS || + cinfo->quant_tbl_ptrs[qtblno] == NULL) + ERREXIT1(cinfo, JERR_NO_QUANT_TABLE, qtblno); + /* OK, save away the quantization table */ + qtbl = (JQUANT_TBL *) + (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE, + sizeof(JQUANT_TBL)); + MEMCOPY(qtbl, cinfo->quant_tbl_ptrs[qtblno], sizeof(JQUANT_TBL)); + compptr->quant_table = qtbl; + } +} + + +/* + * Initialize the input modules to read a scan of compressed data. + * The first call to this is done by jdmaster.c after initializing + * the entire decompressor (during jpeg_start_decompress). + * Subsequent calls come from consume_markers, below. + */ + +METHODDEF(void) +start_input_pass (j_decompress_ptr cinfo) +{ + per_scan_setup(cinfo); + latch_quant_tables(cinfo); + (*cinfo->entropy->start_pass) (cinfo); + (*cinfo->coef->start_input_pass) (cinfo); + cinfo->inputctl->consume_input = cinfo->coef->consume_data; +} + + +/* + * Finish up after inputting a compressed-data scan. + * This is called by the coefficient controller after it's read all + * the expected data of the scan. + */ + +METHODDEF(void) +finish_input_pass (j_decompress_ptr cinfo) +{ + cinfo->inputctl->consume_input = consume_markers; +} + + +/* + * Read JPEG markers before, between, or after compressed-data scans. + * Change state as necessary when a new scan is reached. + * Return value is JPEG_SUSPENDED, JPEG_REACHED_SOS, or JPEG_REACHED_EOI. + * + * The consume_input method pointer points either here or to the + * coefficient controller's consume_data routine, depending on whether + * we are reading a compressed data segment or inter-segment markers. + */ + +METHODDEF(int) +consume_markers (j_decompress_ptr cinfo) +{ + my_inputctl_ptr inputctl = (my_inputctl_ptr) cinfo->inputctl; + int val; + + if (inputctl->pub.eoi_reached) /* After hitting EOI, read no further */ + return JPEG_REACHED_EOI; + + val = (*cinfo->marker->read_markers) (cinfo); + + switch (val) { + case JPEG_REACHED_SOS: /* Found SOS */ + if (inputctl->inheaders) { /* 1st SOS */ + initial_setup(cinfo); + inputctl->inheaders = FALSE; + /* Note: start_input_pass must be called by jdmaster.c + * before any more input can be consumed. jdapimin.c is + * responsible for enforcing this sequencing. + */ + } else { /* 2nd or later SOS marker */ + if (! inputctl->pub.has_multiple_scans) + ERREXIT(cinfo, JERR_EOI_EXPECTED); /* Oops, I wasn't expecting this! */ + start_input_pass(cinfo); + } + break; + case JPEG_REACHED_EOI: /* Found EOI */ + inputctl->pub.eoi_reached = TRUE; + if (inputctl->inheaders) { /* Tables-only datastream, apparently */ + if (cinfo->marker->saw_SOF) + ERREXIT(cinfo, JERR_SOF_NO_SOS); + } else { + /* Prevent infinite loop in coef ctlr's decompress_data routine + * if user set output_scan_number larger than number of scans. + */ + if (cinfo->output_scan_number > cinfo->input_scan_number) + cinfo->output_scan_number = cinfo->input_scan_number; + } + break; + case JPEG_SUSPENDED: + break; + } + + return val; +} + + +/* + * Reset state to begin a fresh datastream. + */ + +METHODDEF(void) +reset_input_controller (j_decompress_ptr cinfo) +{ + my_inputctl_ptr inputctl = (my_inputctl_ptr) cinfo->inputctl; + + inputctl->pub.consume_input = consume_markers; + inputctl->pub.has_multiple_scans = FALSE; /* "unknown" would be better */ + inputctl->pub.eoi_reached = FALSE; + inputctl->inheaders = TRUE; + /* Reset other modules */ + (*cinfo->err->reset_error_mgr) ((j_common_ptr) cinfo); + (*cinfo->marker->reset_marker_reader) (cinfo); + /* Reset progression state -- would be cleaner if entropy decoder did this */ + cinfo->coef_bits = NULL; +} + + +/* + * Initialize the input controller module. + * This is called only once, when the decompression object is created. + */ + +GLOBAL(void) +jinit_input_controller (j_decompress_ptr cinfo) +{ + my_inputctl_ptr inputctl; + + /* Create subobject in permanent pool */ + inputctl = (my_inputctl_ptr) + (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_PERMANENT, + sizeof(my_input_controller)); + cinfo->inputctl = (struct jpeg_input_controller *) inputctl; + /* Initialize method pointers */ + inputctl->pub.consume_input = consume_markers; + inputctl->pub.reset_input_controller = reset_input_controller; + inputctl->pub.start_input_pass = start_input_pass; + inputctl->pub.finish_input_pass = finish_input_pass; + /* Initialize state: can't use reset_input_controller since we don't + * want to try to reset other modules yet. + */ + inputctl->pub.has_multiple_scans = FALSE; /* "unknown" would be better */ + inputctl->pub.eoi_reached = FALSE; + inputctl->inheaders = TRUE; +} diff --git a/media/libjpeg/jdmainct.c b/media/libjpeg/jdmainct.c new file mode 100644 index 000000000..ebb069b0f --- /dev/null +++ b/media/libjpeg/jdmainct.c @@ -0,0 +1,456 @@ +/* + * jdmainct.c + * + * This file was part of the Independent JPEG Group's software: + * Copyright (C) 1994-1996, Thomas G. Lane. + * libjpeg-turbo Modifications: + * Copyright (C) 2010, 2016, D. R. Commander. + * For conditions of distribution and use, see the accompanying README.ijg + * file. + * + * This file contains the main buffer controller for decompression. + * The main buffer lies between the JPEG decompressor proper and the + * post-processor; it holds downsampled data in the JPEG colorspace. + * + * Note that this code is bypassed in raw-data mode, since the application + * supplies the equivalent of the main buffer in that case. + */ + +#include "jinclude.h" +#include "jdmainct.h" + + +/* + * In the current system design, the main buffer need never be a full-image + * buffer; any full-height buffers will be found inside the coefficient or + * postprocessing controllers. Nonetheless, the main controller is not + * trivial. Its responsibility is to provide context rows for upsampling/ + * rescaling, and doing this in an efficient fashion is a bit tricky. + * + * Postprocessor input data is counted in "row groups". A row group + * is defined to be (v_samp_factor * DCT_scaled_size / min_DCT_scaled_size) + * sample rows of each component. (We require DCT_scaled_size values to be + * chosen such that these numbers are integers. In practice DCT_scaled_size + * values will likely be powers of two, so we actually have the stronger + * condition that DCT_scaled_size / min_DCT_scaled_size is an integer.) + * Upsampling will typically produce max_v_samp_factor pixel rows from each + * row group (times any additional scale factor that the upsampler is + * applying). + * + * The coefficient controller will deliver data to us one iMCU row at a time; + * each iMCU row contains v_samp_factor * DCT_scaled_size sample rows, or + * exactly min_DCT_scaled_size row groups. (This amount of data corresponds + * to one row of MCUs when the image is fully interleaved.) Note that the + * number of sample rows varies across components, but the number of row + * groups does not. Some garbage sample rows may be included in the last iMCU + * row at the bottom of the image. + * + * Depending on the vertical scaling algorithm used, the upsampler may need + * access to the sample row(s) above and below its current input row group. + * The upsampler is required to set need_context_rows TRUE at global selection + * time if so. When need_context_rows is FALSE, this controller can simply + * obtain one iMCU row at a time from the coefficient controller and dole it + * out as row groups to the postprocessor. + * + * When need_context_rows is TRUE, this controller guarantees that the buffer + * passed to postprocessing contains at least one row group's worth of samples + * above and below the row group(s) being processed. Note that the context + * rows "above" the first passed row group appear at negative row offsets in + * the passed buffer. At the top and bottom of the image, the required + * context rows are manufactured by duplicating the first or last real sample + * row; this avoids having special cases in the upsampling inner loops. + * + * The amount of context is fixed at one row group just because that's a + * convenient number for this controller to work with. The existing + * upsamplers really only need one sample row of context. An upsampler + * supporting arbitrary output rescaling might wish for more than one row + * group of context when shrinking the image; tough, we don't handle that. + * (This is justified by the assumption that downsizing will be handled mostly + * by adjusting the DCT_scaled_size values, so that the actual scale factor at + * the upsample step needn't be much less than one.) + * + * To provide the desired context, we have to retain the last two row groups + * of one iMCU row while reading in the next iMCU row. (The last row group + * can't be processed until we have another row group for its below-context, + * and so we have to save the next-to-last group too for its above-context.) + * We could do this most simply by copying data around in our buffer, but + * that'd be very slow. We can avoid copying any data by creating a rather + * strange pointer structure. Here's how it works. We allocate a workspace + * consisting of M+2 row groups (where M = min_DCT_scaled_size is the number + * of row groups per iMCU row). We create two sets of redundant pointers to + * the workspace. Labeling the physical row groups 0 to M+1, the synthesized + * pointer lists look like this: + * M+1 M-1 + * master pointer --> 0 master pointer --> 0 + * 1 1 + * ... ... + * M-3 M-3 + * M-2 M + * M-1 M+1 + * M M-2 + * M+1 M-1 + * 0 0 + * We read alternate iMCU rows using each master pointer; thus the last two + * row groups of the previous iMCU row remain un-overwritten in the workspace. + * The pointer lists are set up so that the required context rows appear to + * be adjacent to the proper places when we pass the pointer lists to the + * upsampler. + * + * The above pictures describe the normal state of the pointer lists. + * At top and bottom of the image, we diddle the pointer lists to duplicate + * the first or last sample row as necessary (this is cheaper than copying + * sample rows around). + * + * This scheme breaks down if M < 2, ie, min_DCT_scaled_size is 1. In that + * situation each iMCU row provides only one row group so the buffering logic + * must be different (eg, we must read two iMCU rows before we can emit the + * first row group). For now, we simply do not support providing context + * rows when min_DCT_scaled_size is 1. That combination seems unlikely to + * be worth providing --- if someone wants a 1/8th-size preview, they probably + * want it quick and dirty, so a context-free upsampler is sufficient. + */ + + +/* Forward declarations */ +METHODDEF(void) process_data_simple_main + (j_decompress_ptr cinfo, JSAMPARRAY output_buf, + JDIMENSION *out_row_ctr, JDIMENSION out_rows_avail); +METHODDEF(void) process_data_context_main + (j_decompress_ptr cinfo, JSAMPARRAY output_buf, + JDIMENSION *out_row_ctr, JDIMENSION out_rows_avail); +#ifdef QUANT_2PASS_SUPPORTED +METHODDEF(void) process_data_crank_post + (j_decompress_ptr cinfo, JSAMPARRAY output_buf, + JDIMENSION *out_row_ctr, JDIMENSION out_rows_avail); +#endif + + +LOCAL(void) +alloc_funny_pointers (j_decompress_ptr cinfo) +/* Allocate space for the funny pointer lists. + * This is done only once, not once per pass. + */ +{ + my_main_ptr main_ptr = (my_main_ptr) cinfo->main; + int ci, rgroup; + int M = cinfo->_min_DCT_scaled_size; + jpeg_component_info *compptr; + JSAMPARRAY xbuf; + + /* Get top-level space for component array pointers. + * We alloc both arrays with one call to save a few cycles. + */ + main_ptr->xbuffer[0] = (JSAMPIMAGE) + (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE, + cinfo->num_components * 2 * sizeof(JSAMPARRAY)); + main_ptr->xbuffer[1] = main_ptr->xbuffer[0] + cinfo->num_components; + + for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components; + ci++, compptr++) { + rgroup = (compptr->v_samp_factor * compptr->_DCT_scaled_size) / + cinfo->_min_DCT_scaled_size; /* height of a row group of component */ + /* Get space for pointer lists --- M+4 row groups in each list. + * We alloc both pointer lists with one call to save a few cycles. + */ + xbuf = (JSAMPARRAY) + (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE, + 2 * (rgroup * (M + 4)) * sizeof(JSAMPROW)); + xbuf += rgroup; /* want one row group at negative offsets */ + main_ptr->xbuffer[0][ci] = xbuf; + xbuf += rgroup * (M + 4); + main_ptr->xbuffer[1][ci] = xbuf; + } +} + + +LOCAL(void) +make_funny_pointers (j_decompress_ptr cinfo) +/* Create the funny pointer lists discussed in the comments above. + * The actual workspace is already allocated (in main_ptr->buffer), + * and the space for the pointer lists is allocated too. + * This routine just fills in the curiously ordered lists. + * This will be repeated at the beginning of each pass. + */ +{ + my_main_ptr main_ptr = (my_main_ptr) cinfo->main; + int ci, i, rgroup; + int M = cinfo->_min_DCT_scaled_size; + jpeg_component_info *compptr; + JSAMPARRAY buf, xbuf0, xbuf1; + + for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components; + ci++, compptr++) { + rgroup = (compptr->v_samp_factor * compptr->_DCT_scaled_size) / + cinfo->_min_DCT_scaled_size; /* height of a row group of component */ + xbuf0 = main_ptr->xbuffer[0][ci]; + xbuf1 = main_ptr->xbuffer[1][ci]; + /* First copy the workspace pointers as-is */ + buf = main_ptr->buffer[ci]; + for (i = 0; i < rgroup * (M + 2); i++) { + xbuf0[i] = xbuf1[i] = buf[i]; + } + /* In the second list, put the last four row groups in swapped order */ + for (i = 0; i < rgroup * 2; i++) { + xbuf1[rgroup*(M-2) + i] = buf[rgroup*M + i]; + xbuf1[rgroup*M + i] = buf[rgroup*(M-2) + i]; + } + /* The wraparound pointers at top and bottom will be filled later + * (see set_wraparound_pointers, below). Initially we want the "above" + * pointers to duplicate the first actual data line. This only needs + * to happen in xbuffer[0]. + */ + for (i = 0; i < rgroup; i++) { + xbuf0[i - rgroup] = xbuf0[0]; + } + } +} + + +LOCAL(void) +set_bottom_pointers (j_decompress_ptr cinfo) +/* Change the pointer lists to duplicate the last sample row at the bottom + * of the image. whichptr indicates which xbuffer holds the final iMCU row. + * Also sets rowgroups_avail to indicate number of nondummy row groups in row. + */ +{ + my_main_ptr main_ptr = (my_main_ptr) cinfo->main; + int ci, i, rgroup, iMCUheight, rows_left; + jpeg_component_info *compptr; + JSAMPARRAY xbuf; + + for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components; + ci++, compptr++) { + /* Count sample rows in one iMCU row and in one row group */ + iMCUheight = compptr->v_samp_factor * compptr->_DCT_scaled_size; + rgroup = iMCUheight / cinfo->_min_DCT_scaled_size; + /* Count nondummy sample rows remaining for this component */ + rows_left = (int) (compptr->downsampled_height % (JDIMENSION) iMCUheight); + if (rows_left == 0) rows_left = iMCUheight; + /* Count nondummy row groups. Should get same answer for each component, + * so we need only do it once. + */ + if (ci == 0) { + main_ptr->rowgroups_avail = (JDIMENSION) ((rows_left-1) / rgroup + 1); + } + /* Duplicate the last real sample row rgroup*2 times; this pads out the + * last partial rowgroup and ensures at least one full rowgroup of context. + */ + xbuf = main_ptr->xbuffer[main_ptr->whichptr][ci]; + for (i = 0; i < rgroup * 2; i++) { + xbuf[rows_left + i] = xbuf[rows_left-1]; + } + } +} + + +/* + * Initialize for a processing pass. + */ + +METHODDEF(void) +start_pass_main (j_decompress_ptr cinfo, J_BUF_MODE pass_mode) +{ + my_main_ptr main_ptr = (my_main_ptr) cinfo->main; + + switch (pass_mode) { + case JBUF_PASS_THRU: + if (cinfo->upsample->need_context_rows) { + main_ptr->pub.process_data = process_data_context_main; + make_funny_pointers(cinfo); /* Create the xbuffer[] lists */ + main_ptr->whichptr = 0; /* Read first iMCU row into xbuffer[0] */ + main_ptr->context_state = CTX_PREPARE_FOR_IMCU; + main_ptr->iMCU_row_ctr = 0; + } else { + /* Simple case with no context needed */ + main_ptr->pub.process_data = process_data_simple_main; + } + main_ptr->buffer_full = FALSE; /* Mark buffer empty */ + main_ptr->rowgroup_ctr = 0; + break; +#ifdef QUANT_2PASS_SUPPORTED + case JBUF_CRANK_DEST: + /* For last pass of 2-pass quantization, just crank the postprocessor */ + main_ptr->pub.process_data = process_data_crank_post; + break; +#endif + default: + ERREXIT(cinfo, JERR_BAD_BUFFER_MODE); + break; + } +} + + +/* + * Process some data. + * This handles the simple case where no context is required. + */ + +METHODDEF(void) +process_data_simple_main (j_decompress_ptr cinfo, + JSAMPARRAY output_buf, JDIMENSION *out_row_ctr, + JDIMENSION out_rows_avail) +{ + my_main_ptr main_ptr = (my_main_ptr) cinfo->main; + JDIMENSION rowgroups_avail; + + /* Read input data if we haven't filled the main buffer yet */ + if (! main_ptr->buffer_full) { + if (! (*cinfo->coef->decompress_data) (cinfo, main_ptr->buffer)) + return; /* suspension forced, can do nothing more */ + main_ptr->buffer_full = TRUE; /* OK, we have an iMCU row to work with */ + } + + /* There are always min_DCT_scaled_size row groups in an iMCU row. */ + rowgroups_avail = (JDIMENSION) cinfo->_min_DCT_scaled_size; + /* Note: at the bottom of the image, we may pass extra garbage row groups + * to the postprocessor. The postprocessor has to check for bottom + * of image anyway (at row resolution), so no point in us doing it too. + */ + + /* Feed the postprocessor */ + (*cinfo->post->post_process_data) (cinfo, main_ptr->buffer, + &main_ptr->rowgroup_ctr, rowgroups_avail, + output_buf, out_row_ctr, out_rows_avail); + + /* Has postprocessor consumed all the data yet? If so, mark buffer empty */ + if (main_ptr->rowgroup_ctr >= rowgroups_avail) { + main_ptr->buffer_full = FALSE; + main_ptr->rowgroup_ctr = 0; + } +} + + +/* + * Process some data. + * This handles the case where context rows must be provided. + */ + +METHODDEF(void) +process_data_context_main (j_decompress_ptr cinfo, + JSAMPARRAY output_buf, JDIMENSION *out_row_ctr, + JDIMENSION out_rows_avail) +{ + my_main_ptr main_ptr = (my_main_ptr) cinfo->main; + + /* Read input data if we haven't filled the main buffer yet */ + if (! main_ptr->buffer_full) { + if (! (*cinfo->coef->decompress_data) (cinfo, + main_ptr->xbuffer[main_ptr->whichptr])) + return; /* suspension forced, can do nothing more */ + main_ptr->buffer_full = TRUE; /* OK, we have an iMCU row to work with */ + main_ptr->iMCU_row_ctr++; /* count rows received */ + } + + /* Postprocessor typically will not swallow all the input data it is handed + * in one call (due to filling the output buffer first). Must be prepared + * to exit and restart. This switch lets us keep track of how far we got. + * Note that each case falls through to the next on successful completion. + */ + switch (main_ptr->context_state) { + case CTX_POSTPONED_ROW: + /* Call postprocessor using previously set pointers for postponed row */ + (*cinfo->post->post_process_data) (cinfo, main_ptr->xbuffer[main_ptr->whichptr], + &main_ptr->rowgroup_ctr, main_ptr->rowgroups_avail, + output_buf, out_row_ctr, out_rows_avail); + if (main_ptr->rowgroup_ctr < main_ptr->rowgroups_avail) + return; /* Need to suspend */ + main_ptr->context_state = CTX_PREPARE_FOR_IMCU; + if (*out_row_ctr >= out_rows_avail) + return; /* Postprocessor exactly filled output buf */ + /*FALLTHROUGH*/ + case CTX_PREPARE_FOR_IMCU: + /* Prepare to process first M-1 row groups of this iMCU row */ + main_ptr->rowgroup_ctr = 0; + main_ptr->rowgroups_avail = (JDIMENSION) (cinfo->_min_DCT_scaled_size - 1); + /* Check for bottom of image: if so, tweak pointers to "duplicate" + * the last sample row, and adjust rowgroups_avail to ignore padding rows. + */ + if (main_ptr->iMCU_row_ctr == cinfo->total_iMCU_rows) + set_bottom_pointers(cinfo); + main_ptr->context_state = CTX_PROCESS_IMCU; + /*FALLTHROUGH*/ + case CTX_PROCESS_IMCU: + /* Call postprocessor using previously set pointers */ + (*cinfo->post->post_process_data) (cinfo, main_ptr->xbuffer[main_ptr->whichptr], + &main_ptr->rowgroup_ctr, main_ptr->rowgroups_avail, + output_buf, out_row_ctr, out_rows_avail); + if (main_ptr->rowgroup_ctr < main_ptr->rowgroups_avail) + return; /* Need to suspend */ + /* After the first iMCU, change wraparound pointers to normal state */ + if (main_ptr->iMCU_row_ctr == 1) + set_wraparound_pointers(cinfo); + /* Prepare to load new iMCU row using other xbuffer list */ + main_ptr->whichptr ^= 1; /* 0=>1 or 1=>0 */ + main_ptr->buffer_full = FALSE; + /* Still need to process last row group of this iMCU row, */ + /* which is saved at index M+1 of the other xbuffer */ + main_ptr->rowgroup_ctr = (JDIMENSION) (cinfo->_min_DCT_scaled_size + 1); + main_ptr->rowgroups_avail = (JDIMENSION) (cinfo->_min_DCT_scaled_size + 2); + main_ptr->context_state = CTX_POSTPONED_ROW; + } +} + + +/* + * Process some data. + * Final pass of two-pass quantization: just call the postprocessor. + * Source data will be the postprocessor controller's internal buffer. + */ + +#ifdef QUANT_2PASS_SUPPORTED + +METHODDEF(void) +process_data_crank_post (j_decompress_ptr cinfo, + JSAMPARRAY output_buf, JDIMENSION *out_row_ctr, + JDIMENSION out_rows_avail) +{ + (*cinfo->post->post_process_data) (cinfo, (JSAMPIMAGE) NULL, + (JDIMENSION *) NULL, (JDIMENSION) 0, + output_buf, out_row_ctr, out_rows_avail); +} + +#endif /* QUANT_2PASS_SUPPORTED */ + + +/* + * Initialize main buffer controller. + */ + +GLOBAL(void) +jinit_d_main_controller (j_decompress_ptr cinfo, boolean need_full_buffer) +{ + my_main_ptr main_ptr; + int ci, rgroup, ngroups; + jpeg_component_info *compptr; + + main_ptr = (my_main_ptr) + (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE, + sizeof(my_main_controller)); + cinfo->main = (struct jpeg_d_main_controller *) main_ptr; + main_ptr->pub.start_pass = start_pass_main; + + if (need_full_buffer) /* shouldn't happen */ + ERREXIT(cinfo, JERR_BAD_BUFFER_MODE); + + /* Allocate the workspace. + * ngroups is the number of row groups we need. + */ + if (cinfo->upsample->need_context_rows) { + if (cinfo->_min_DCT_scaled_size < 2) /* unsupported, see comments above */ + ERREXIT(cinfo, JERR_NOTIMPL); + alloc_funny_pointers(cinfo); /* Alloc space for xbuffer[] lists */ + ngroups = cinfo->_min_DCT_scaled_size + 2; + } else { + ngroups = cinfo->_min_DCT_scaled_size; + } + + for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components; + ci++, compptr++) { + rgroup = (compptr->v_samp_factor * compptr->_DCT_scaled_size) / + cinfo->_min_DCT_scaled_size; /* height of a row group of component */ + main_ptr->buffer[ci] = (*cinfo->mem->alloc_sarray) + ((j_common_ptr) cinfo, JPOOL_IMAGE, + compptr->width_in_blocks * compptr->_DCT_scaled_size, + (JDIMENSION) (rgroup * ngroups)); + } +} diff --git a/media/libjpeg/jdmainct.h b/media/libjpeg/jdmainct.h new file mode 100644 index 000000000..30903019c --- /dev/null +++ b/media/libjpeg/jdmainct.h @@ -0,0 +1,71 @@ +/* + * jdmainct.h + * + * This file was part of the Independent JPEG Group's software: + * Copyright (C) 1994-1996, Thomas G. Lane. + * For conditions of distribution and use, see the accompanying README.ijg + * file. + */ + +#define JPEG_INTERNALS +#include "jpeglib.h" +#include "jpegcomp.h" + + +/* Private buffer controller object */ + +typedef struct { + struct jpeg_d_main_controller pub; /* public fields */ + + /* Pointer to allocated workspace (M or M+2 row groups). */ + JSAMPARRAY buffer[MAX_COMPONENTS]; + + boolean buffer_full; /* Have we gotten an iMCU row from decoder? */ + JDIMENSION rowgroup_ctr; /* counts row groups output to postprocessor */ + + /* Remaining fields are only used in the context case. */ + + /* These are the master pointers to the funny-order pointer lists. */ + JSAMPIMAGE xbuffer[2]; /* pointers to weird pointer lists */ + + int whichptr; /* indicates which pointer set is now in use */ + int context_state; /* process_data state machine status */ + JDIMENSION rowgroups_avail; /* row groups available to postprocessor */ + JDIMENSION iMCU_row_ctr; /* counts iMCU rows to detect image top/bot */ +} my_main_controller; + +typedef my_main_controller *my_main_ptr; + + +/* context_state values: */ +#define CTX_PREPARE_FOR_IMCU 0 /* need to prepare for MCU row */ +#define CTX_PROCESS_IMCU 1 /* feeding iMCU to postprocessor */ +#define CTX_POSTPONED_ROW 2 /* feeding postponed row group */ + + +LOCAL(void) +set_wraparound_pointers (j_decompress_ptr cinfo) +/* Set up the "wraparound" pointers at top and bottom of the pointer lists. + * This changes the pointer list state from top-of-image to the normal state. + */ +{ + my_main_ptr main_ptr = (my_main_ptr) cinfo->main; + int ci, i, rgroup; + int M = cinfo->_min_DCT_scaled_size; + jpeg_component_info *compptr; + JSAMPARRAY xbuf0, xbuf1; + + for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components; + ci++, compptr++) { + rgroup = (compptr->v_samp_factor * compptr->_DCT_scaled_size) / + cinfo->_min_DCT_scaled_size; /* height of a row group of component */ + xbuf0 = main_ptr->xbuffer[0][ci]; + xbuf1 = main_ptr->xbuffer[1][ci]; + for (i = 0; i < rgroup; i++) { + xbuf0[i - rgroup] = xbuf0[rgroup*(M+1) + i]; + xbuf1[i - rgroup] = xbuf1[rgroup*(M+1) + i]; + xbuf0[rgroup*(M+2) + i] = xbuf0[i]; + xbuf1[rgroup*(M+2) + i] = xbuf1[i]; + } + } +} diff --git a/media/libjpeg/jdmarker.c b/media/libjpeg/jdmarker.c new file mode 100644 index 000000000..e3b612c9b --- /dev/null +++ b/media/libjpeg/jdmarker.c @@ -0,0 +1,1377 @@ +/* + * jdmarker.c + * + * This file was part of the Independent JPEG Group's software: + * Copyright (C) 1991-1998, Thomas G. Lane. + * libjpeg-turbo Modifications: + * Copyright (C) 2012, 2015, D. R. Commander. + * For conditions of distribution and use, see the accompanying README.ijg + * file. + * + * This file contains routines to decode JPEG datastream markers. + * Most of the complexity arises from our desire to support input + * suspension: if not all of the data for a marker is available, + * we must exit back to the application. On resumption, we reprocess + * the marker. + */ + +#define JPEG_INTERNALS +#include "jinclude.h" +#include "jpeglib.h" + + +typedef enum { /* JPEG marker codes */ + M_SOF0 = 0xc0, + M_SOF1 = 0xc1, + M_SOF2 = 0xc2, + M_SOF3 = 0xc3, + + M_SOF5 = 0xc5, + M_SOF6 = 0xc6, + M_SOF7 = 0xc7, + + M_JPG = 0xc8, + M_SOF9 = 0xc9, + M_SOF10 = 0xca, + M_SOF11 = 0xcb, + + M_SOF13 = 0xcd, + M_SOF14 = 0xce, + M_SOF15 = 0xcf, + + M_DHT = 0xc4, + + M_DAC = 0xcc, + + M_RST0 = 0xd0, + M_RST1 = 0xd1, + M_RST2 = 0xd2, + M_RST3 = 0xd3, + M_RST4 = 0xd4, + M_RST5 = 0xd5, + M_RST6 = 0xd6, + M_RST7 = 0xd7, + + M_SOI = 0xd8, + M_EOI = 0xd9, + M_SOS = 0xda, + M_DQT = 0xdb, + M_DNL = 0xdc, + M_DRI = 0xdd, + M_DHP = 0xde, + M_EXP = 0xdf, + + M_APP0 = 0xe0, + M_APP1 = 0xe1, + M_APP2 = 0xe2, + M_APP3 = 0xe3, + M_APP4 = 0xe4, + M_APP5 = 0xe5, + M_APP6 = 0xe6, + M_APP7 = 0xe7, + M_APP8 = 0xe8, + M_APP9 = 0xe9, + M_APP10 = 0xea, + M_APP11 = 0xeb, + M_APP12 = 0xec, + M_APP13 = 0xed, + M_APP14 = 0xee, + M_APP15 = 0xef, + + M_JPG0 = 0xf0, + M_JPG13 = 0xfd, + M_COM = 0xfe, + + M_TEM = 0x01, + + M_ERROR = 0x100 +} JPEG_MARKER; + + +/* Private state */ + +typedef struct { + struct jpeg_marker_reader pub; /* public fields */ + + /* Application-overridable marker processing methods */ + jpeg_marker_parser_method process_COM; + jpeg_marker_parser_method process_APPn[16]; + + /* Limit on marker data length to save for each marker type */ + unsigned int length_limit_COM; + unsigned int length_limit_APPn[16]; + + /* Status of COM/APPn marker saving */ + jpeg_saved_marker_ptr cur_marker; /* NULL if not processing a marker */ + unsigned int bytes_read; /* data bytes read so far in marker */ + /* Note: cur_marker is not linked into marker_list until it's all read. */ +} my_marker_reader; + +typedef my_marker_reader *my_marker_ptr; + + +/* + * Macros for fetching data from the data source module. + * + * At all times, cinfo->src->next_input_byte and ->bytes_in_buffer reflect + * the current restart point; we update them only when we have reached a + * suitable place to restart if a suspension occurs. + */ + +/* Declare and initialize local copies of input pointer/count */ +#define INPUT_VARS(cinfo) \ + struct jpeg_source_mgr *datasrc = (cinfo)->src; \ + const JOCTET *next_input_byte = datasrc->next_input_byte; \ + size_t bytes_in_buffer = datasrc->bytes_in_buffer + +/* Unload the local copies --- do this only at a restart boundary */ +#define INPUT_SYNC(cinfo) \ + ( datasrc->next_input_byte = next_input_byte, \ + datasrc->bytes_in_buffer = bytes_in_buffer ) + +/* Reload the local copies --- used only in MAKE_BYTE_AVAIL */ +#define INPUT_RELOAD(cinfo) \ + ( next_input_byte = datasrc->next_input_byte, \ + bytes_in_buffer = datasrc->bytes_in_buffer ) + +/* Internal macro for INPUT_BYTE and INPUT_2BYTES: make a byte available. + * Note we do *not* do INPUT_SYNC before calling fill_input_buffer, + * but we must reload the local copies after a successful fill. + */ +#define MAKE_BYTE_AVAIL(cinfo,action) \ + if (bytes_in_buffer == 0) { \ + if (! (*datasrc->fill_input_buffer) (cinfo)) \ + { action; } \ + INPUT_RELOAD(cinfo); \ + } + +/* Read a byte into variable V. + * If must suspend, take the specified action (typically "return FALSE"). + */ +#define INPUT_BYTE(cinfo,V,action) \ + MAKESTMT( MAKE_BYTE_AVAIL(cinfo,action); \ + bytes_in_buffer--; \ + V = GETJOCTET(*next_input_byte++); ) + +/* As above, but read two bytes interpreted as an unsigned 16-bit integer. + * V should be declared unsigned int or perhaps JLONG. + */ +#define INPUT_2BYTES(cinfo,V,action) \ + MAKESTMT( MAKE_BYTE_AVAIL(cinfo,action); \ + bytes_in_buffer--; \ + V = ((unsigned int) GETJOCTET(*next_input_byte++)) << 8; \ + MAKE_BYTE_AVAIL(cinfo,action); \ + bytes_in_buffer--; \ + V += GETJOCTET(*next_input_byte++); ) + + +/* + * Routines to process JPEG markers. + * + * Entry condition: JPEG marker itself has been read and its code saved + * in cinfo->unread_marker; input restart point is just after the marker. + * + * Exit: if return TRUE, have read and processed any parameters, and have + * updated the restart point to point after the parameters. + * If return FALSE, was forced to suspend before reaching end of + * marker parameters; restart point has not been moved. Same routine + * will be called again after application supplies more input data. + * + * This approach to suspension assumes that all of a marker's parameters + * can fit into a single input bufferload. This should hold for "normal" + * markers. Some COM/APPn markers might have large parameter segments + * that might not fit. If we are simply dropping such a marker, we use + * skip_input_data to get past it, and thereby put the problem on the + * source manager's shoulders. If we are saving the marker's contents + * into memory, we use a slightly different convention: when forced to + * suspend, the marker processor updates the restart point to the end of + * what it's consumed (ie, the end of the buffer) before returning FALSE. + * On resumption, cinfo->unread_marker still contains the marker code, + * but the data source will point to the next chunk of marker data. + * The marker processor must retain internal state to deal with this. + * + * Note that we don't bother to avoid duplicate trace messages if a + * suspension occurs within marker parameters. Other side effects + * require more care. + */ + + +LOCAL(boolean) +get_soi (j_decompress_ptr cinfo) +/* Process an SOI marker */ +{ + int i; + + TRACEMS(cinfo, 1, JTRC_SOI); + + if (cinfo->marker->saw_SOI) + ERREXIT(cinfo, JERR_SOI_DUPLICATE); + + /* Reset all parameters that are defined to be reset by SOI */ + + for (i = 0; i < NUM_ARITH_TBLS; i++) { + cinfo->arith_dc_L[i] = 0; + cinfo->arith_dc_U[i] = 1; + cinfo->arith_ac_K[i] = 5; + } + cinfo->restart_interval = 0; + + /* Set initial assumptions for colorspace etc */ + + cinfo->jpeg_color_space = JCS_UNKNOWN; + cinfo->CCIR601_sampling = FALSE; /* Assume non-CCIR sampling??? */ + + cinfo->saw_JFIF_marker = FALSE; + cinfo->JFIF_major_version = 1; /* set default JFIF APP0 values */ + cinfo->JFIF_minor_version = 1; + cinfo->density_unit = 0; + cinfo->X_density = 1; + cinfo->Y_density = 1; + cinfo->saw_Adobe_marker = FALSE; + cinfo->Adobe_transform = 0; + + cinfo->marker->saw_SOI = TRUE; + + return TRUE; +} + + +LOCAL(boolean) +get_sof (j_decompress_ptr cinfo, boolean is_prog, boolean is_arith) +/* Process a SOFn marker */ +{ + JLONG length; + int c, ci; + jpeg_component_info *compptr; + INPUT_VARS(cinfo); + + cinfo->progressive_mode = is_prog; + cinfo->arith_code = is_arith; + + INPUT_2BYTES(cinfo, length, return FALSE); + + INPUT_BYTE(cinfo, cinfo->data_precision, return FALSE); + INPUT_2BYTES(cinfo, cinfo->image_height, return FALSE); + INPUT_2BYTES(cinfo, cinfo->image_width, return FALSE); + INPUT_BYTE(cinfo, cinfo->num_components, return FALSE); + + length -= 8; + + TRACEMS4(cinfo, 1, JTRC_SOF, cinfo->unread_marker, + (int) cinfo->image_width, (int) cinfo->image_height, + cinfo->num_components); + + if (cinfo->marker->saw_SOF) + ERREXIT(cinfo, JERR_SOF_DUPLICATE); + + /* We don't support files in which the image height is initially specified */ + /* as 0 and is later redefined by DNL. As long as we have to check that, */ + /* might as well have a general sanity check. */ + if (cinfo->image_height <= 0 || cinfo->image_width <= 0 + || cinfo->num_components <= 0) + ERREXIT(cinfo, JERR_EMPTY_IMAGE); + + if (length != (cinfo->num_components * 3)) + ERREXIT(cinfo, JERR_BAD_LENGTH); + + if (cinfo->comp_info == NULL) /* do only once, even if suspend */ + cinfo->comp_info = (jpeg_component_info *) (*cinfo->mem->alloc_small) + ((j_common_ptr) cinfo, JPOOL_IMAGE, + cinfo->num_components * sizeof(jpeg_component_info)); + + for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components; + ci++, compptr++) { + compptr->component_index = ci; + INPUT_BYTE(cinfo, compptr->component_id, return FALSE); + INPUT_BYTE(cinfo, c, return FALSE); + compptr->h_samp_factor = (c >> 4) & 15; + compptr->v_samp_factor = (c ) & 15; + INPUT_BYTE(cinfo, compptr->quant_tbl_no, return FALSE); + + TRACEMS4(cinfo, 1, JTRC_SOF_COMPONENT, + compptr->component_id, compptr->h_samp_factor, + compptr->v_samp_factor, compptr->quant_tbl_no); + } + + cinfo->marker->saw_SOF = TRUE; + + INPUT_SYNC(cinfo); + return TRUE; +} + + +LOCAL(boolean) +get_sos (j_decompress_ptr cinfo) +/* Process a SOS marker */ +{ + JLONG length; + int i, ci, n, c, cc, pi; + jpeg_component_info *compptr; + INPUT_VARS(cinfo); + + if (! cinfo->marker->saw_SOF) + ERREXIT(cinfo, JERR_SOS_NO_SOF); + + INPUT_2BYTES(cinfo, length, return FALSE); + + INPUT_BYTE(cinfo, n, return FALSE); /* Number of components */ + + TRACEMS1(cinfo, 1, JTRC_SOS, n); + + if (length != (n * 2 + 6) || n < 1 || n > MAX_COMPS_IN_SCAN) + ERREXIT(cinfo, JERR_BAD_LENGTH); + + cinfo->comps_in_scan = n; + + /* Collect the component-spec parameters */ + + for (i = 0; i < MAX_COMPS_IN_SCAN; i++) + cinfo->cur_comp_info[i] = NULL; + + for (i = 0; i < n; i++) { + INPUT_BYTE(cinfo, cc, return FALSE); + INPUT_BYTE(cinfo, c, return FALSE); + + for (ci = 0, compptr = cinfo->comp_info; + ci < cinfo->num_components && ci < MAX_COMPS_IN_SCAN; + ci++, compptr++) { + if (cc == compptr->component_id && !cinfo->cur_comp_info[ci]) + goto id_found; + } + + ERREXIT1(cinfo, JERR_BAD_COMPONENT_ID, cc); + + id_found: + + cinfo->cur_comp_info[i] = compptr; + compptr->dc_tbl_no = (c >> 4) & 15; + compptr->ac_tbl_no = (c ) & 15; + + TRACEMS3(cinfo, 1, JTRC_SOS_COMPONENT, cc, + compptr->dc_tbl_no, compptr->ac_tbl_no); + + /* This CSi (cc) should differ from the previous CSi */ + for (pi = 0; pi < i; pi++) { + if (cinfo->cur_comp_info[pi] == compptr) { + ERREXIT1(cinfo, JERR_BAD_COMPONENT_ID, cc); + } + } + } + + /* Collect the additional scan parameters Ss, Se, Ah/Al. */ + INPUT_BYTE(cinfo, c, return FALSE); + cinfo->Ss = c; + INPUT_BYTE(cinfo, c, return FALSE); + cinfo->Se = c; + INPUT_BYTE(cinfo, c, return FALSE); + cinfo->Ah = (c >> 4) & 15; + cinfo->Al = (c ) & 15; + + TRACEMS4(cinfo, 1, JTRC_SOS_PARAMS, cinfo->Ss, cinfo->Se, + cinfo->Ah, cinfo->Al); + + /* Prepare to scan data & restart markers */ + cinfo->marker->next_restart_num = 0; + + /* Count another SOS marker */ + cinfo->input_scan_number++; + + INPUT_SYNC(cinfo); + return TRUE; +} + + +#ifdef D_ARITH_CODING_SUPPORTED + +LOCAL(boolean) +get_dac (j_decompress_ptr cinfo) +/* Process a DAC marker */ +{ + JLONG length; + int index, val; + INPUT_VARS(cinfo); + + INPUT_2BYTES(cinfo, length, return FALSE); + length -= 2; + + while (length > 0) { + INPUT_BYTE(cinfo, index, return FALSE); + INPUT_BYTE(cinfo, val, return FALSE); + + length -= 2; + + TRACEMS2(cinfo, 1, JTRC_DAC, index, val); + + if (index < 0 || index >= (2*NUM_ARITH_TBLS)) + ERREXIT1(cinfo, JERR_DAC_INDEX, index); + + if (index >= NUM_ARITH_TBLS) { /* define AC table */ + cinfo->arith_ac_K[index-NUM_ARITH_TBLS] = (UINT8) val; + } else { /* define DC table */ + cinfo->arith_dc_L[index] = (UINT8) (val & 0x0F); + cinfo->arith_dc_U[index] = (UINT8) (val >> 4); + if (cinfo->arith_dc_L[index] > cinfo->arith_dc_U[index]) + ERREXIT1(cinfo, JERR_DAC_VALUE, val); + } + } + + if (length != 0) + ERREXIT(cinfo, JERR_BAD_LENGTH); + + INPUT_SYNC(cinfo); + return TRUE; +} + +#else /* ! D_ARITH_CODING_SUPPORTED */ + +#define get_dac(cinfo) skip_variable(cinfo) + +#endif /* D_ARITH_CODING_SUPPORTED */ + + +LOCAL(boolean) +get_dht (j_decompress_ptr cinfo) +/* Process a DHT marker */ +{ + JLONG length; + UINT8 bits[17]; + UINT8 huffval[256]; + int i, index, count; + JHUFF_TBL **htblptr; + INPUT_VARS(cinfo); + + INPUT_2BYTES(cinfo, length, return FALSE); + length -= 2; + + while (length > 16) { + INPUT_BYTE(cinfo, index, return FALSE); + + TRACEMS1(cinfo, 1, JTRC_DHT, index); + + bits[0] = 0; + count = 0; + for (i = 1; i <= 16; i++) { + INPUT_BYTE(cinfo, bits[i], return FALSE); + count += bits[i]; + } + + length -= 1 + 16; + + TRACEMS8(cinfo, 2, JTRC_HUFFBITS, + bits[1], bits[2], bits[3], bits[4], + bits[5], bits[6], bits[7], bits[8]); + TRACEMS8(cinfo, 2, JTRC_HUFFBITS, + bits[9], bits[10], bits[11], bits[12], + bits[13], bits[14], bits[15], bits[16]); + + /* Here we just do minimal validation of the counts to avoid walking + * off the end of our table space. jdhuff.c will check more carefully. + */ + if (count > 256 || ((JLONG) count) > length) + ERREXIT(cinfo, JERR_BAD_HUFF_TABLE); + + for (i = 0; i < count; i++) + INPUT_BYTE(cinfo, huffval[i], return FALSE); + + MEMZERO(&huffval[count], (256 - count) * sizeof(UINT8)); + + length -= count; + + if (index & 0x10) { /* AC table definition */ + index -= 0x10; + if (index < 0 || index >= NUM_HUFF_TBLS) + ERREXIT1(cinfo, JERR_DHT_INDEX, index); + htblptr = &cinfo->ac_huff_tbl_ptrs[index]; + } else { /* DC table definition */ + if (index < 0 || index >= NUM_HUFF_TBLS) + ERREXIT1(cinfo, JERR_DHT_INDEX, index); + htblptr = &cinfo->dc_huff_tbl_ptrs[index]; + } + + if (*htblptr == NULL) + *htblptr = jpeg_alloc_huff_table((j_common_ptr) cinfo); + + MEMCOPY((*htblptr)->bits, bits, sizeof((*htblptr)->bits)); + MEMCOPY((*htblptr)->huffval, huffval, sizeof((*htblptr)->huffval)); + } + + if (length != 0) + ERREXIT(cinfo, JERR_BAD_LENGTH); + + INPUT_SYNC(cinfo); + return TRUE; +} + + +LOCAL(boolean) +get_dqt (j_decompress_ptr cinfo) +/* Process a DQT marker */ +{ + JLONG length; + int n, i, prec; + unsigned int tmp; + JQUANT_TBL *quant_ptr; + INPUT_VARS(cinfo); + + INPUT_2BYTES(cinfo, length, return FALSE); + length -= 2; + + while (length > 0) { + INPUT_BYTE(cinfo, n, return FALSE); + prec = n >> 4; + n &= 0x0F; + + TRACEMS2(cinfo, 1, JTRC_DQT, n, prec); + + if (n >= NUM_QUANT_TBLS) + ERREXIT1(cinfo, JERR_DQT_INDEX, n); + + if (cinfo->quant_tbl_ptrs[n] == NULL) + cinfo->quant_tbl_ptrs[n] = jpeg_alloc_quant_table((j_common_ptr) cinfo); + quant_ptr = cinfo->quant_tbl_ptrs[n]; + + for (i = 0; i < DCTSIZE2; i++) { + if (prec) + INPUT_2BYTES(cinfo, tmp, return FALSE); + else + INPUT_BYTE(cinfo, tmp, return FALSE); + /* We convert the zigzag-order table to natural array order. */ + quant_ptr->quantval[jpeg_natural_order[i]] = (UINT16) tmp; + } + + if (cinfo->err->trace_level >= 2) { + for (i = 0; i < DCTSIZE2; i += 8) { + TRACEMS8(cinfo, 2, JTRC_QUANTVALS, + quant_ptr->quantval[i], quant_ptr->quantval[i+1], + quant_ptr->quantval[i+2], quant_ptr->quantval[i+3], + quant_ptr->quantval[i+4], quant_ptr->quantval[i+5], + quant_ptr->quantval[i+6], quant_ptr->quantval[i+7]); + } + } + + length -= DCTSIZE2+1; + if (prec) length -= DCTSIZE2; + } + + if (length != 0) + ERREXIT(cinfo, JERR_BAD_LENGTH); + + INPUT_SYNC(cinfo); + return TRUE; +} + + +LOCAL(boolean) +get_dri (j_decompress_ptr cinfo) +/* Process a DRI marker */ +{ + JLONG length; + unsigned int tmp; + INPUT_VARS(cinfo); + + INPUT_2BYTES(cinfo, length, return FALSE); + + if (length != 4) + ERREXIT(cinfo, JERR_BAD_LENGTH); + + INPUT_2BYTES(cinfo, tmp, return FALSE); + + TRACEMS1(cinfo, 1, JTRC_DRI, tmp); + + cinfo->restart_interval = tmp; + + INPUT_SYNC(cinfo); + return TRUE; +} + + +/* + * Routines for processing APPn and COM markers. + * These are either saved in memory or discarded, per application request. + * APP0 and APP14 are specially checked to see if they are + * JFIF and Adobe markers, respectively. + */ + +#define APP0_DATA_LEN 14 /* Length of interesting data in APP0 */ +#define APP14_DATA_LEN 12 /* Length of interesting data in APP14 */ +#define APPN_DATA_LEN 14 /* Must be the largest of the above!! */ + + +LOCAL(void) +examine_app0 (j_decompress_ptr cinfo, JOCTET *data, + unsigned int datalen, JLONG remaining) +/* Examine first few bytes from an APP0. + * Take appropriate action if it is a JFIF marker. + * datalen is # of bytes at data[], remaining is length of rest of marker data. + */ +{ + JLONG totallen = (JLONG) datalen + remaining; + + if (datalen >= APP0_DATA_LEN && + GETJOCTET(data[0]) == 0x4A && + GETJOCTET(data[1]) == 0x46 && + GETJOCTET(data[2]) == 0x49 && + GETJOCTET(data[3]) == 0x46 && + GETJOCTET(data[4]) == 0) { + /* Found JFIF APP0 marker: save info */ + cinfo->saw_JFIF_marker = TRUE; + cinfo->JFIF_major_version = GETJOCTET(data[5]); + cinfo->JFIF_minor_version = GETJOCTET(data[6]); + cinfo->density_unit = GETJOCTET(data[7]); + cinfo->X_density = (GETJOCTET(data[8]) << 8) + GETJOCTET(data[9]); + cinfo->Y_density = (GETJOCTET(data[10]) << 8) + GETJOCTET(data[11]); + /* Check version. + * Major version must be 1, anything else signals an incompatible change. + * (We used to treat this as an error, but now it's a nonfatal warning, + * because some bozo at Hijaak couldn't read the spec.) + * Minor version should be 0..2, but process anyway if newer. + */ + if (cinfo->JFIF_major_version != 1) + WARNMS2(cinfo, JWRN_JFIF_MAJOR, + cinfo->JFIF_major_version, cinfo->JFIF_minor_version); + /* Generate trace messages */ + TRACEMS5(cinfo, 1, JTRC_JFIF, + cinfo->JFIF_major_version, cinfo->JFIF_minor_version, + cinfo->X_density, cinfo->Y_density, cinfo->density_unit); + /* Validate thumbnail dimensions and issue appropriate messages */ + if (GETJOCTET(data[12]) | GETJOCTET(data[13])) + TRACEMS2(cinfo, 1, JTRC_JFIF_THUMBNAIL, + GETJOCTET(data[12]), GETJOCTET(data[13])); + totallen -= APP0_DATA_LEN; + if (totallen != + ((JLONG)GETJOCTET(data[12]) * (JLONG)GETJOCTET(data[13]) * (JLONG) 3)) + TRACEMS1(cinfo, 1, JTRC_JFIF_BADTHUMBNAILSIZE, (int) totallen); + } else if (datalen >= 6 && + GETJOCTET(data[0]) == 0x4A && + GETJOCTET(data[1]) == 0x46 && + GETJOCTET(data[2]) == 0x58 && + GETJOCTET(data[3]) == 0x58 && + GETJOCTET(data[4]) == 0) { + /* Found JFIF "JFXX" extension APP0 marker */ + /* The library doesn't actually do anything with these, + * but we try to produce a helpful trace message. + */ + switch (GETJOCTET(data[5])) { + case 0x10: + TRACEMS1(cinfo, 1, JTRC_THUMB_JPEG, (int) totallen); + break; + case 0x11: + TRACEMS1(cinfo, 1, JTRC_THUMB_PALETTE, (int) totallen); + break; + case 0x13: + TRACEMS1(cinfo, 1, JTRC_THUMB_RGB, (int) totallen); + break; + default: + TRACEMS2(cinfo, 1, JTRC_JFIF_EXTENSION, + GETJOCTET(data[5]), (int) totallen); + break; + } + } else { + /* Start of APP0 does not match "JFIF" or "JFXX", or too short */ + TRACEMS1(cinfo, 1, JTRC_APP0, (int) totallen); + } +} + + +LOCAL(void) +examine_app14 (j_decompress_ptr cinfo, JOCTET *data, + unsigned int datalen, JLONG remaining) +/* Examine first few bytes from an APP14. + * Take appropriate action if it is an Adobe marker. + * datalen is # of bytes at data[], remaining is length of rest of marker data. + */ +{ + unsigned int version, flags0, flags1, transform; + + if (datalen >= APP14_DATA_LEN && + GETJOCTET(data[0]) == 0x41 && + GETJOCTET(data[1]) == 0x64 && + GETJOCTET(data[2]) == 0x6F && + GETJOCTET(data[3]) == 0x62 && + GETJOCTET(data[4]) == 0x65) { + /* Found Adobe APP14 marker */ + version = (GETJOCTET(data[5]) << 8) + GETJOCTET(data[6]); + flags0 = (GETJOCTET(data[7]) << 8) + GETJOCTET(data[8]); + flags1 = (GETJOCTET(data[9]) << 8) + GETJOCTET(data[10]); + transform = GETJOCTET(data[11]); + TRACEMS4(cinfo, 1, JTRC_ADOBE, version, flags0, flags1, transform); + cinfo->saw_Adobe_marker = TRUE; + cinfo->Adobe_transform = (UINT8) transform; + } else { + /* Start of APP14 does not match "Adobe", or too short */ + TRACEMS1(cinfo, 1, JTRC_APP14, (int) (datalen + remaining)); + } +} + + +METHODDEF(boolean) +get_interesting_appn (j_decompress_ptr cinfo) +/* Process an APP0 or APP14 marker without saving it */ +{ + JLONG length; + JOCTET b[APPN_DATA_LEN]; + unsigned int i, numtoread; + INPUT_VARS(cinfo); + + INPUT_2BYTES(cinfo, length, return FALSE); + length -= 2; + + /* get the interesting part of the marker data */ + if (length >= APPN_DATA_LEN) + numtoread = APPN_DATA_LEN; + else if (length > 0) + numtoread = (unsigned int) length; + else + numtoread = 0; + for (i = 0; i < numtoread; i++) + INPUT_BYTE(cinfo, b[i], return FALSE); + length -= numtoread; + + /* process it */ + switch (cinfo->unread_marker) { + case M_APP0: + examine_app0(cinfo, (JOCTET *) b, numtoread, length); + break; + case M_APP14: + examine_app14(cinfo, (JOCTET *) b, numtoread, length); + break; + default: + /* can't get here unless jpeg_save_markers chooses wrong processor */ + ERREXIT1(cinfo, JERR_UNKNOWN_MARKER, cinfo->unread_marker); + break; + } + + /* skip any remaining data -- could be lots */ + INPUT_SYNC(cinfo); + if (length > 0) + (*cinfo->src->skip_input_data) (cinfo, (long) length); + + return TRUE; +} + + +#ifdef SAVE_MARKERS_SUPPORTED + +METHODDEF(boolean) +save_marker (j_decompress_ptr cinfo) +/* Save an APPn or COM marker into the marker list */ +{ + my_marker_ptr marker = (my_marker_ptr) cinfo->marker; + jpeg_saved_marker_ptr cur_marker = marker->cur_marker; + unsigned int bytes_read, data_length; + JOCTET *data; + JLONG length = 0; + INPUT_VARS(cinfo); + + if (cur_marker == NULL) { + /* begin reading a marker */ + INPUT_2BYTES(cinfo, length, return FALSE); + length -= 2; + if (length >= 0) { /* watch out for bogus length word */ + /* figure out how much we want to save */ + unsigned int limit; + if (cinfo->unread_marker == (int) M_COM) + limit = marker->length_limit_COM; + else + limit = marker->length_limit_APPn[cinfo->unread_marker - (int) M_APP0]; + if ((unsigned int) length < limit) + limit = (unsigned int) length; + /* allocate and initialize the marker item */ + cur_marker = (jpeg_saved_marker_ptr) + (*cinfo->mem->alloc_large) ((j_common_ptr) cinfo, JPOOL_IMAGE, + sizeof(struct jpeg_marker_struct) + limit); + cur_marker->next = NULL; + cur_marker->marker = (UINT8) cinfo->unread_marker; + cur_marker->original_length = (unsigned int) length; + cur_marker->data_length = limit; + /* data area is just beyond the jpeg_marker_struct */ + data = cur_marker->data = (JOCTET *) (cur_marker + 1); + marker->cur_marker = cur_marker; + marker->bytes_read = 0; + bytes_read = 0; + data_length = limit; + } else { + /* deal with bogus length word */ + bytes_read = data_length = 0; + data = NULL; + } + } else { + /* resume reading a marker */ + bytes_read = marker->bytes_read; + data_length = cur_marker->data_length; + data = cur_marker->data + bytes_read; + } + + while (bytes_read < data_length) { + INPUT_SYNC(cinfo); /* move the restart point to here */ + marker->bytes_read = bytes_read; + /* If there's not at least one byte in buffer, suspend */ + MAKE_BYTE_AVAIL(cinfo, return FALSE); + /* Copy bytes with reasonable rapidity */ + while (bytes_read < data_length && bytes_in_buffer > 0) { + *data++ = *next_input_byte++; + bytes_in_buffer--; + bytes_read++; + } + } + + /* Done reading what we want to read */ + if (cur_marker != NULL) { /* will be NULL if bogus length word */ + /* Add new marker to end of list */ + if (cinfo->marker_list == NULL) { + cinfo->marker_list = cur_marker; + } else { + jpeg_saved_marker_ptr prev = cinfo->marker_list; + while (prev->next != NULL) + prev = prev->next; + prev->next = cur_marker; + } + /* Reset pointer & calc remaining data length */ + data = cur_marker->data; + length = cur_marker->original_length - data_length; + } + /* Reset to initial state for next marker */ + marker->cur_marker = NULL; + + /* Process the marker if interesting; else just make a generic trace msg */ + switch (cinfo->unread_marker) { + case M_APP0: + examine_app0(cinfo, data, data_length, length); + break; + case M_APP14: + examine_app14(cinfo, data, data_length, length); + break; + default: + TRACEMS2(cinfo, 1, JTRC_MISC_MARKER, cinfo->unread_marker, + (int) (data_length + length)); + break; + } + + /* skip any remaining data -- could be lots */ + INPUT_SYNC(cinfo); /* do before skip_input_data */ + if (length > 0) + (*cinfo->src->skip_input_data) (cinfo, (long) length); + + return TRUE; +} + +#endif /* SAVE_MARKERS_SUPPORTED */ + + +METHODDEF(boolean) +skip_variable (j_decompress_ptr cinfo) +/* Skip over an unknown or uninteresting variable-length marker */ +{ + JLONG length; + INPUT_VARS(cinfo); + + INPUT_2BYTES(cinfo, length, return FALSE); + length -= 2; + + TRACEMS2(cinfo, 1, JTRC_MISC_MARKER, cinfo->unread_marker, (int) length); + + INPUT_SYNC(cinfo); /* do before skip_input_data */ + if (length > 0) + (*cinfo->src->skip_input_data) (cinfo, (long) length); + + return TRUE; +} + + +/* + * Find the next JPEG marker, save it in cinfo->unread_marker. + * Returns FALSE if had to suspend before reaching a marker; + * in that case cinfo->unread_marker is unchanged. + * + * Note that the result might not be a valid marker code, + * but it will never be 0 or FF. + */ + +LOCAL(boolean) +next_marker (j_decompress_ptr cinfo) +{ + int c; + INPUT_VARS(cinfo); + + for (;;) { + INPUT_BYTE(cinfo, c, return FALSE); + /* Skip any non-FF bytes. + * This may look a bit inefficient, but it will not occur in a valid file. + * We sync after each discarded byte so that a suspending data source + * can discard the byte from its buffer. + */ + while (c != 0xFF) { + cinfo->marker->discarded_bytes++; + INPUT_SYNC(cinfo); + INPUT_BYTE(cinfo, c, return FALSE); + } + /* This loop swallows any duplicate FF bytes. Extra FFs are legal as + * pad bytes, so don't count them in discarded_bytes. We assume there + * will not be so many consecutive FF bytes as to overflow a suspending + * data source's input buffer. + */ + do { + INPUT_BYTE(cinfo, c, return FALSE); + } while (c == 0xFF); + if (c != 0) + break; /* found a valid marker, exit loop */ + /* Reach here if we found a stuffed-zero data sequence (FF/00). + * Discard it and loop back to try again. + */ + cinfo->marker->discarded_bytes += 2; + INPUT_SYNC(cinfo); + } + + if (cinfo->marker->discarded_bytes != 0) { + WARNMS2(cinfo, JWRN_EXTRANEOUS_DATA, cinfo->marker->discarded_bytes, c); + cinfo->marker->discarded_bytes = 0; + } + + cinfo->unread_marker = c; + + INPUT_SYNC(cinfo); + return TRUE; +} + + +LOCAL(boolean) +first_marker (j_decompress_ptr cinfo) +/* Like next_marker, but used to obtain the initial SOI marker. */ +/* For this marker, we do not allow preceding garbage or fill; otherwise, + * we might well scan an entire input file before realizing it ain't JPEG. + * If an application wants to process non-JFIF files, it must seek to the + * SOI before calling the JPEG library. + */ +{ + int c, c2; + INPUT_VARS(cinfo); + + INPUT_BYTE(cinfo, c, return FALSE); + INPUT_BYTE(cinfo, c2, return FALSE); + if (c != 0xFF || c2 != (int) M_SOI) + ERREXIT2(cinfo, JERR_NO_SOI, c, c2); + + cinfo->unread_marker = c2; + + INPUT_SYNC(cinfo); + return TRUE; +} + + +/* + * Read markers until SOS or EOI. + * + * Returns same codes as are defined for jpeg_consume_input: + * JPEG_SUSPENDED, JPEG_REACHED_SOS, or JPEG_REACHED_EOI. + */ + +METHODDEF(int) +read_markers (j_decompress_ptr cinfo) +{ + /* Outer loop repeats once for each marker. */ + for (;;) { + /* Collect the marker proper, unless we already did. */ + /* NB: first_marker() enforces the requirement that SOI appear first. */ + if (cinfo->unread_marker == 0) { + if (! cinfo->marker->saw_SOI) { + if (! first_marker(cinfo)) + return JPEG_SUSPENDED; + } else { + if (! next_marker(cinfo)) + return JPEG_SUSPENDED; + } + } + /* At this point cinfo->unread_marker contains the marker code and the + * input point is just past the marker proper, but before any parameters. + * A suspension will cause us to return with this state still true. + */ + switch (cinfo->unread_marker) { + case M_SOI: + if (! get_soi(cinfo)) + return JPEG_SUSPENDED; + break; + + case M_SOF0: /* Baseline */ + case M_SOF1: /* Extended sequential, Huffman */ + if (! get_sof(cinfo, FALSE, FALSE)) + return JPEG_SUSPENDED; + break; + + case M_SOF2: /* Progressive, Huffman */ + if (! get_sof(cinfo, TRUE, FALSE)) + return JPEG_SUSPENDED; + break; + + case M_SOF9: /* Extended sequential, arithmetic */ + if (! get_sof(cinfo, FALSE, TRUE)) + return JPEG_SUSPENDED; + break; + + case M_SOF10: /* Progressive, arithmetic */ + if (! get_sof(cinfo, TRUE, TRUE)) + return JPEG_SUSPENDED; + break; + + /* Currently unsupported SOFn types */ + case M_SOF3: /* Lossless, Huffman */ + case M_SOF5: /* Differential sequential, Huffman */ + case M_SOF6: /* Differential progressive, Huffman */ + case M_SOF7: /* Differential lossless, Huffman */ + case M_JPG: /* Reserved for JPEG extensions */ + case M_SOF11: /* Lossless, arithmetic */ + case M_SOF13: /* Differential sequential, arithmetic */ + case M_SOF14: /* Differential progressive, arithmetic */ + case M_SOF15: /* Differential lossless, arithmetic */ + ERREXIT1(cinfo, JERR_SOF_UNSUPPORTED, cinfo->unread_marker); + break; + + case M_SOS: + if (! get_sos(cinfo)) + return JPEG_SUSPENDED; + cinfo->unread_marker = 0; /* processed the marker */ + return JPEG_REACHED_SOS; + + case M_EOI: + TRACEMS(cinfo, 1, JTRC_EOI); + cinfo->unread_marker = 0; /* processed the marker */ + return JPEG_REACHED_EOI; + + case M_DAC: + if (! get_dac(cinfo)) + return JPEG_SUSPENDED; + break; + + case M_DHT: + if (! get_dht(cinfo)) + return JPEG_SUSPENDED; + break; + + case M_DQT: + if (! get_dqt(cinfo)) + return JPEG_SUSPENDED; + break; + + case M_DRI: + if (! get_dri(cinfo)) + return JPEG_SUSPENDED; + break; + + case M_APP0: + case M_APP1: + case M_APP2: + case M_APP3: + case M_APP4: + case M_APP5: + case M_APP6: + case M_APP7: + case M_APP8: + case M_APP9: + case M_APP10: + case M_APP11: + case M_APP12: + case M_APP13: + case M_APP14: + case M_APP15: + if (! (*((my_marker_ptr) cinfo->marker)->process_APPn[ + cinfo->unread_marker - (int) M_APP0]) (cinfo)) + return JPEG_SUSPENDED; + break; + + case M_COM: + if (! (*((my_marker_ptr) cinfo->marker)->process_COM) (cinfo)) + return JPEG_SUSPENDED; + break; + + case M_RST0: /* these are all parameterless */ + case M_RST1: + case M_RST2: + case M_RST3: + case M_RST4: + case M_RST5: + case M_RST6: + case M_RST7: + case M_TEM: + TRACEMS1(cinfo, 1, JTRC_PARMLESS_MARKER, cinfo->unread_marker); + break; + + case M_DNL: /* Ignore DNL ... perhaps the wrong thing */ + if (! skip_variable(cinfo)) + return JPEG_SUSPENDED; + break; + + default: /* must be DHP, EXP, JPGn, or RESn */ + /* For now, we treat the reserved markers as fatal errors since they are + * likely to be used to signal incompatible JPEG Part 3 extensions. + * Once the JPEG 3 version-number marker is well defined, this code + * ought to change! + */ + ERREXIT1(cinfo, JERR_UNKNOWN_MARKER, cinfo->unread_marker); + break; + } + /* Successfully processed marker, so reset state variable */ + cinfo->unread_marker = 0; + } /* end loop */ +} + + +/* + * Read a restart marker, which is expected to appear next in the datastream; + * if the marker is not there, take appropriate recovery action. + * Returns FALSE if suspension is required. + * + * This is called by the entropy decoder after it has read an appropriate + * number of MCUs. cinfo->unread_marker may be nonzero if the entropy decoder + * has already read a marker from the data source. Under normal conditions + * cinfo->unread_marker will be reset to 0 before returning; if not reset, + * it holds a marker which the decoder will be unable to read past. + */ + +METHODDEF(boolean) +read_restart_marker (j_decompress_ptr cinfo) +{ + /* Obtain a marker unless we already did. */ + /* Note that next_marker will complain if it skips any data. */ + if (cinfo->unread_marker == 0) { + if (! next_marker(cinfo)) + return FALSE; + } + + if (cinfo->unread_marker == + ((int) M_RST0 + cinfo->marker->next_restart_num)) { + /* Normal case --- swallow the marker and let entropy decoder continue */ + TRACEMS1(cinfo, 3, JTRC_RST, cinfo->marker->next_restart_num); + cinfo->unread_marker = 0; + } else { + /* Uh-oh, the restart markers have been messed up. */ + /* Let the data source manager determine how to resync. */ + if (! (*cinfo->src->resync_to_restart) (cinfo, + cinfo->marker->next_restart_num)) + return FALSE; + } + + /* Update next-restart state */ + cinfo->marker->next_restart_num = (cinfo->marker->next_restart_num + 1) & 7; + + return TRUE; +} + + +/* + * This is the default resync_to_restart method for data source managers + * to use if they don't have any better approach. Some data source managers + * may be able to back up, or may have additional knowledge about the data + * which permits a more intelligent recovery strategy; such managers would + * presumably supply their own resync method. + * + * read_restart_marker calls resync_to_restart if it finds a marker other than + * the restart marker it was expecting. (This code is *not* used unless + * a nonzero restart interval has been declared.) cinfo->unread_marker is + * the marker code actually found (might be anything, except 0 or FF). + * The desired restart marker number (0..7) is passed as a parameter. + * This routine is supposed to apply whatever error recovery strategy seems + * appropriate in order to position the input stream to the next data segment. + * Note that cinfo->unread_marker is treated as a marker appearing before + * the current data-source input point; usually it should be reset to zero + * before returning. + * Returns FALSE if suspension is required. + * + * This implementation is substantially constrained by wanting to treat the + * input as a data stream; this means we can't back up. Therefore, we have + * only the following actions to work with: + * 1. Simply discard the marker and let the entropy decoder resume at next + * byte of file. + * 2. Read forward until we find another marker, discarding intervening + * data. (In theory we could look ahead within the current bufferload, + * without having to discard data if we don't find the desired marker. + * This idea is not implemented here, in part because it makes behavior + * dependent on buffer size and chance buffer-boundary positions.) + * 3. Leave the marker unread (by failing to zero cinfo->unread_marker). + * This will cause the entropy decoder to process an empty data segment, + * inserting dummy zeroes, and then we will reprocess the marker. + * + * #2 is appropriate if we think the desired marker lies ahead, while #3 is + * appropriate if the found marker is a future restart marker (indicating + * that we have missed the desired restart marker, probably because it got + * corrupted). + * We apply #2 or #3 if the found marker is a restart marker no more than + * two counts behind or ahead of the expected one. We also apply #2 if the + * found marker is not a legal JPEG marker code (it's certainly bogus data). + * If the found marker is a restart marker more than 2 counts away, we do #1 + * (too much risk that the marker is erroneous; with luck we will be able to + * resync at some future point). + * For any valid non-restart JPEG marker, we apply #3. This keeps us from + * overrunning the end of a scan. An implementation limited to single-scan + * files might find it better to apply #2 for markers other than EOI, since + * any other marker would have to be bogus data in that case. + */ + +GLOBAL(boolean) +jpeg_resync_to_restart (j_decompress_ptr cinfo, int desired) +{ + int marker = cinfo->unread_marker; + int action = 1; + + /* Always put up a warning. */ + WARNMS2(cinfo, JWRN_MUST_RESYNC, marker, desired); + + /* Outer loop handles repeated decision after scanning forward. */ + for (;;) { + if (marker < (int) M_SOF0) + action = 2; /* invalid marker */ + else if (marker < (int) M_RST0 || marker > (int) M_RST7) + action = 3; /* valid non-restart marker */ + else { + if (marker == ((int) M_RST0 + ((desired+1) & 7)) || + marker == ((int) M_RST0 + ((desired+2) & 7))) + action = 3; /* one of the next two expected restarts */ + else if (marker == ((int) M_RST0 + ((desired-1) & 7)) || + marker == ((int) M_RST0 + ((desired-2) & 7))) + action = 2; /* a prior restart, so advance */ + else + action = 1; /* desired restart or too far away */ + } + TRACEMS2(cinfo, 4, JTRC_RECOVERY_ACTION, marker, action); + switch (action) { + case 1: + /* Discard marker and let entropy decoder resume processing. */ + cinfo->unread_marker = 0; + return TRUE; + case 2: + /* Scan to the next marker, and repeat the decision loop. */ + if (! next_marker(cinfo)) + return FALSE; + marker = cinfo->unread_marker; + break; + case 3: + /* Return without advancing past this marker. */ + /* Entropy decoder will be forced to process an empty segment. */ + return TRUE; + } + } /* end loop */ +} + + +/* + * Reset marker processing state to begin a fresh datastream. + */ + +METHODDEF(void) +reset_marker_reader (j_decompress_ptr cinfo) +{ + my_marker_ptr marker = (my_marker_ptr) cinfo->marker; + + cinfo->comp_info = NULL; /* until allocated by get_sof */ + cinfo->input_scan_number = 0; /* no SOS seen yet */ + cinfo->unread_marker = 0; /* no pending marker */ + marker->pub.saw_SOI = FALSE; /* set internal state too */ + marker->pub.saw_SOF = FALSE; + marker->pub.discarded_bytes = 0; + marker->cur_marker = NULL; +} + + +/* + * Initialize the marker reader module. + * This is called only once, when the decompression object is created. + */ + +GLOBAL(void) +jinit_marker_reader (j_decompress_ptr cinfo) +{ + my_marker_ptr marker; + int i; + + /* Create subobject in permanent pool */ + marker = (my_marker_ptr) + (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_PERMANENT, + sizeof(my_marker_reader)); + cinfo->marker = (struct jpeg_marker_reader *) marker; + /* Initialize public method pointers */ + marker->pub.reset_marker_reader = reset_marker_reader; + marker->pub.read_markers = read_markers; + marker->pub.read_restart_marker = read_restart_marker; + /* Initialize COM/APPn processing. + * By default, we examine and then discard APP0 and APP14, + * but simply discard COM and all other APPn. + */ + marker->process_COM = skip_variable; + marker->length_limit_COM = 0; + for (i = 0; i < 16; i++) { + marker->process_APPn[i] = skip_variable; + marker->length_limit_APPn[i] = 0; + } + marker->process_APPn[0] = get_interesting_appn; + marker->process_APPn[14] = get_interesting_appn; + /* Reset marker processing state */ + reset_marker_reader(cinfo); +} + + +/* + * Control saving of COM and APPn markers into marker_list. + */ + +#ifdef SAVE_MARKERS_SUPPORTED + +GLOBAL(void) +jpeg_save_markers (j_decompress_ptr cinfo, int marker_code, + unsigned int length_limit) +{ + my_marker_ptr marker = (my_marker_ptr) cinfo->marker; + long maxlength; + jpeg_marker_parser_method processor; + + /* Length limit mustn't be larger than what we can allocate + * (should only be a concern in a 16-bit environment). + */ + maxlength = cinfo->mem->max_alloc_chunk - sizeof(struct jpeg_marker_struct); + if (((long) length_limit) > maxlength) + length_limit = (unsigned int) maxlength; + + /* Choose processor routine to use. + * APP0/APP14 have special requirements. + */ + if (length_limit) { + processor = save_marker; + /* If saving APP0/APP14, save at least enough for our internal use. */ + if (marker_code == (int) M_APP0 && length_limit < APP0_DATA_LEN) + length_limit = APP0_DATA_LEN; + else if (marker_code == (int) M_APP14 && length_limit < APP14_DATA_LEN) + length_limit = APP14_DATA_LEN; + } else { + processor = skip_variable; + /* If discarding APP0/APP14, use our regular on-the-fly processor. */ + if (marker_code == (int) M_APP0 || marker_code == (int) M_APP14) + processor = get_interesting_appn; + } + + if (marker_code == (int) M_COM) { + marker->process_COM = processor; + marker->length_limit_COM = length_limit; + } else if (marker_code >= (int) M_APP0 && marker_code <= (int) M_APP15) { + marker->process_APPn[marker_code - (int) M_APP0] = processor; + marker->length_limit_APPn[marker_code - (int) M_APP0] = length_limit; + } else + ERREXIT1(cinfo, JERR_UNKNOWN_MARKER, marker_code); +} + +#endif /* SAVE_MARKERS_SUPPORTED */ + + +/* + * Install a special processing method for COM or APPn markers. + */ + +GLOBAL(void) +jpeg_set_marker_processor (j_decompress_ptr cinfo, int marker_code, + jpeg_marker_parser_method routine) +{ + my_marker_ptr marker = (my_marker_ptr) cinfo->marker; + + if (marker_code == (int) M_COM) + marker->process_COM = routine; + else if (marker_code >= (int) M_APP0 && marker_code <= (int) M_APP15) + marker->process_APPn[marker_code - (int) M_APP0] = routine; + else + ERREXIT1(cinfo, JERR_UNKNOWN_MARKER, marker_code); +} diff --git a/media/libjpeg/jdmaster.c b/media/libjpeg/jdmaster.c new file mode 100644 index 000000000..9079dda65 --- /dev/null +++ b/media/libjpeg/jdmaster.c @@ -0,0 +1,736 @@ +/* + * jdmaster.c + * + * This file was part of the Independent JPEG Group's software: + * Copyright (C) 1991-1997, Thomas G. Lane. + * Modified 2002-2009 by Guido Vollbeding. + * libjpeg-turbo Modifications: + * Copyright (C) 2009-2011, 2016, D. R. Commander. + * Copyright (C) 2013, Linaro Limited. + * Copyright (C) 2015, Google, Inc. + * For conditions of distribution and use, see the accompanying README.ijg + * file. + * + * This file contains master control logic for the JPEG decompressor. + * These routines are concerned with selecting the modules to be executed + * and with determining the number of passes and the work to be done in each + * pass. + */ + +#define JPEG_INTERNALS +#include "jinclude.h" +#include "jpeglib.h" +#include "jpegcomp.h" +#include "jdmaster.h" +#include "jsimd.h" + + +/* + * Determine whether merged upsample/color conversion should be used. + * CRUCIAL: this must match the actual capabilities of jdmerge.c! + */ + +LOCAL(boolean) +use_merged_upsample (j_decompress_ptr cinfo) +{ +#ifdef UPSAMPLE_MERGING_SUPPORTED + /* Merging is the equivalent of plain box-filter upsampling */ + if (cinfo->do_fancy_upsampling || cinfo->CCIR601_sampling) + return FALSE; + /* jdmerge.c only supports YCC=>RGB and YCC=>RGB565 color conversion */ + if (cinfo->jpeg_color_space != JCS_YCbCr || cinfo->num_components != 3 || + (cinfo->out_color_space != JCS_RGB && + cinfo->out_color_space != JCS_RGB565 && + cinfo->out_color_space != JCS_EXT_RGB && + cinfo->out_color_space != JCS_EXT_RGBX && + cinfo->out_color_space != JCS_EXT_BGR && + cinfo->out_color_space != JCS_EXT_BGRX && + cinfo->out_color_space != JCS_EXT_XBGR && + cinfo->out_color_space != JCS_EXT_XRGB && + cinfo->out_color_space != JCS_EXT_RGBA && + cinfo->out_color_space != JCS_EXT_BGRA && + cinfo->out_color_space != JCS_EXT_ABGR && + cinfo->out_color_space != JCS_EXT_ARGB)) + return FALSE; + if ((cinfo->out_color_space == JCS_RGB565 && + cinfo->out_color_components != 3) || + (cinfo->out_color_space != JCS_RGB565 && + cinfo->out_color_components != rgb_pixelsize[cinfo->out_color_space])) + return FALSE; + /* and it only handles 2h1v or 2h2v sampling ratios */ + if (cinfo->comp_info[0].h_samp_factor != 2 || + cinfo->comp_info[1].h_samp_factor != 1 || + cinfo->comp_info[2].h_samp_factor != 1 || + cinfo->comp_info[0].v_samp_factor > 2 || + cinfo->comp_info[1].v_samp_factor != 1 || + cinfo->comp_info[2].v_samp_factor != 1) + return FALSE; + /* furthermore, it doesn't work if we've scaled the IDCTs differently */ + if (cinfo->comp_info[0]._DCT_scaled_size != cinfo->_min_DCT_scaled_size || + cinfo->comp_info[1]._DCT_scaled_size != cinfo->_min_DCT_scaled_size || + cinfo->comp_info[2]._DCT_scaled_size != cinfo->_min_DCT_scaled_size) + return FALSE; +#ifdef WITH_SIMD + /* If YCbCr-to-RGB color conversion is SIMD-accelerated but merged upsampling + isn't, then disabling merged upsampling is likely to be faster when + decompressing YCbCr JPEG images. */ + if (!jsimd_can_h2v2_merged_upsample() && !jsimd_can_h2v1_merged_upsample() && + jsimd_can_ycc_rgb() && cinfo->jpeg_color_space == JCS_YCbCr && + (cinfo->out_color_space == JCS_RGB || + (cinfo->out_color_space >= JCS_EXT_RGB && + cinfo->out_color_space <= JCS_EXT_ARGB))) + return FALSE; +#endif + /* ??? also need to test for upsample-time rescaling, when & if supported */ + return TRUE; /* by golly, it'll work... */ +#else + return FALSE; +#endif +} + + +/* + * Compute output image dimensions and related values. + * NOTE: this is exported for possible use by application. + * Hence it mustn't do anything that can't be done twice. + */ + +#if JPEG_LIB_VERSION >= 80 +GLOBAL(void) +#else +LOCAL(void) +#endif +jpeg_core_output_dimensions (j_decompress_ptr cinfo) +/* Do computations that are needed before master selection phase. + * This function is used for transcoding and full decompression. + */ +{ +#ifdef IDCT_SCALING_SUPPORTED + int ci; + jpeg_component_info *compptr; + + /* Compute actual output image dimensions and DCT scaling choices. */ + if (cinfo->scale_num * DCTSIZE <= cinfo->scale_denom) { + /* Provide 1/block_size scaling */ + cinfo->output_width = (JDIMENSION) + jdiv_round_up((long) cinfo->image_width, (long) DCTSIZE); + cinfo->output_height = (JDIMENSION) + jdiv_round_up((long) cinfo->image_height, (long) DCTSIZE); + cinfo->_min_DCT_h_scaled_size = 1; + cinfo->_min_DCT_v_scaled_size = 1; + } else if (cinfo->scale_num * DCTSIZE <= cinfo->scale_denom * 2) { + /* Provide 2/block_size scaling */ + cinfo->output_width = (JDIMENSION) + jdiv_round_up((long) cinfo->image_width * 2L, (long) DCTSIZE); + cinfo->output_height = (JDIMENSION) + jdiv_round_up((long) cinfo->image_height * 2L, (long) DCTSIZE); + cinfo->_min_DCT_h_scaled_size = 2; + cinfo->_min_DCT_v_scaled_size = 2; + } else if (cinfo->scale_num * DCTSIZE <= cinfo->scale_denom * 3) { + /* Provide 3/block_size scaling */ + cinfo->output_width = (JDIMENSION) + jdiv_round_up((long) cinfo->image_width * 3L, (long) DCTSIZE); + cinfo->output_height = (JDIMENSION) + jdiv_round_up((long) cinfo->image_height * 3L, (long) DCTSIZE); + cinfo->_min_DCT_h_scaled_size = 3; + cinfo->_min_DCT_v_scaled_size = 3; + } else if (cinfo->scale_num * DCTSIZE <= cinfo->scale_denom * 4) { + /* Provide 4/block_size scaling */ + cinfo->output_width = (JDIMENSION) + jdiv_round_up((long) cinfo->image_width * 4L, (long) DCTSIZE); + cinfo->output_height = (JDIMENSION) + jdiv_round_up((long) cinfo->image_height * 4L, (long) DCTSIZE); + cinfo->_min_DCT_h_scaled_size = 4; + cinfo->_min_DCT_v_scaled_size = 4; + } else if (cinfo->scale_num * DCTSIZE <= cinfo->scale_denom * 5) { + /* Provide 5/block_size scaling */ + cinfo->output_width = (JDIMENSION) + jdiv_round_up((long) cinfo->image_width * 5L, (long) DCTSIZE); + cinfo->output_height = (JDIMENSION) + jdiv_round_up((long) cinfo->image_height * 5L, (long) DCTSIZE); + cinfo->_min_DCT_h_scaled_size = 5; + cinfo->_min_DCT_v_scaled_size = 5; + } else if (cinfo->scale_num * DCTSIZE <= cinfo->scale_denom * 6) { + /* Provide 6/block_size scaling */ + cinfo->output_width = (JDIMENSION) + jdiv_round_up((long) cinfo->image_width * 6L, (long) DCTSIZE); + cinfo->output_height = (JDIMENSION) + jdiv_round_up((long) cinfo->image_height * 6L, (long) DCTSIZE); + cinfo->_min_DCT_h_scaled_size = 6; + cinfo->_min_DCT_v_scaled_size = 6; + } else if (cinfo->scale_num * DCTSIZE <= cinfo->scale_denom * 7) { + /* Provide 7/block_size scaling */ + cinfo->output_width = (JDIMENSION) + jdiv_round_up((long) cinfo->image_width * 7L, (long) DCTSIZE); + cinfo->output_height = (JDIMENSION) + jdiv_round_up((long) cinfo->image_height * 7L, (long) DCTSIZE); + cinfo->_min_DCT_h_scaled_size = 7; + cinfo->_min_DCT_v_scaled_size = 7; + } else if (cinfo->scale_num * DCTSIZE <= cinfo->scale_denom * 8) { + /* Provide 8/block_size scaling */ + cinfo->output_width = (JDIMENSION) + jdiv_round_up((long) cinfo->image_width * 8L, (long) DCTSIZE); + cinfo->output_height = (JDIMENSION) + jdiv_round_up((long) cinfo->image_height * 8L, (long) DCTSIZE); + cinfo->_min_DCT_h_scaled_size = 8; + cinfo->_min_DCT_v_scaled_size = 8; + } else if (cinfo->scale_num * DCTSIZE <= cinfo->scale_denom * 9) { + /* Provide 9/block_size scaling */ + cinfo->output_width = (JDIMENSION) + jdiv_round_up((long) cinfo->image_width * 9L, (long) DCTSIZE); + cinfo->output_height = (JDIMENSION) + jdiv_round_up((long) cinfo->image_height * 9L, (long) DCTSIZE); + cinfo->_min_DCT_h_scaled_size = 9; + cinfo->_min_DCT_v_scaled_size = 9; + } else if (cinfo->scale_num * DCTSIZE <= cinfo->scale_denom * 10) { + /* Provide 10/block_size scaling */ + cinfo->output_width = (JDIMENSION) + jdiv_round_up((long) cinfo->image_width * 10L, (long) DCTSIZE); + cinfo->output_height = (JDIMENSION) + jdiv_round_up((long) cinfo->image_height * 10L, (long) DCTSIZE); + cinfo->_min_DCT_h_scaled_size = 10; + cinfo->_min_DCT_v_scaled_size = 10; + } else if (cinfo->scale_num * DCTSIZE <= cinfo->scale_denom * 11) { + /* Provide 11/block_size scaling */ + cinfo->output_width = (JDIMENSION) + jdiv_round_up((long) cinfo->image_width * 11L, (long) DCTSIZE); + cinfo->output_height = (JDIMENSION) + jdiv_round_up((long) cinfo->image_height * 11L, (long) DCTSIZE); + cinfo->_min_DCT_h_scaled_size = 11; + cinfo->_min_DCT_v_scaled_size = 11; + } else if (cinfo->scale_num * DCTSIZE <= cinfo->scale_denom * 12) { + /* Provide 12/block_size scaling */ + cinfo->output_width = (JDIMENSION) + jdiv_round_up((long) cinfo->image_width * 12L, (long) DCTSIZE); + cinfo->output_height = (JDIMENSION) + jdiv_round_up((long) cinfo->image_height * 12L, (long) DCTSIZE); + cinfo->_min_DCT_h_scaled_size = 12; + cinfo->_min_DCT_v_scaled_size = 12; + } else if (cinfo->scale_num * DCTSIZE <= cinfo->scale_denom * 13) { + /* Provide 13/block_size scaling */ + cinfo->output_width = (JDIMENSION) + jdiv_round_up((long) cinfo->image_width * 13L, (long) DCTSIZE); + cinfo->output_height = (JDIMENSION) + jdiv_round_up((long) cinfo->image_height * 13L, (long) DCTSIZE); + cinfo->_min_DCT_h_scaled_size = 13; + cinfo->_min_DCT_v_scaled_size = 13; + } else if (cinfo->scale_num * DCTSIZE <= cinfo->scale_denom * 14) { + /* Provide 14/block_size scaling */ + cinfo->output_width = (JDIMENSION) + jdiv_round_up((long) cinfo->image_width * 14L, (long) DCTSIZE); + cinfo->output_height = (JDIMENSION) + jdiv_round_up((long) cinfo->image_height * 14L, (long) DCTSIZE); + cinfo->_min_DCT_h_scaled_size = 14; + cinfo->_min_DCT_v_scaled_size = 14; + } else if (cinfo->scale_num * DCTSIZE <= cinfo->scale_denom * 15) { + /* Provide 15/block_size scaling */ + cinfo->output_width = (JDIMENSION) + jdiv_round_up((long) cinfo->image_width * 15L, (long) DCTSIZE); + cinfo->output_height = (JDIMENSION) + jdiv_round_up((long) cinfo->image_height * 15L, (long) DCTSIZE); + cinfo->_min_DCT_h_scaled_size = 15; + cinfo->_min_DCT_v_scaled_size = 15; + } else { + /* Provide 16/block_size scaling */ + cinfo->output_width = (JDIMENSION) + jdiv_round_up((long) cinfo->image_width * 16L, (long) DCTSIZE); + cinfo->output_height = (JDIMENSION) + jdiv_round_up((long) cinfo->image_height * 16L, (long) DCTSIZE); + cinfo->_min_DCT_h_scaled_size = 16; + cinfo->_min_DCT_v_scaled_size = 16; + } + + /* Recompute dimensions of components */ + for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components; + ci++, compptr++) { + compptr->_DCT_h_scaled_size = cinfo->_min_DCT_h_scaled_size; + compptr->_DCT_v_scaled_size = cinfo->_min_DCT_v_scaled_size; + } + +#else /* !IDCT_SCALING_SUPPORTED */ + + /* Hardwire it to "no scaling" */ + cinfo->output_width = cinfo->image_width; + cinfo->output_height = cinfo->image_height; + /* jdinput.c has already initialized DCT_scaled_size, + * and has computed unscaled downsampled_width and downsampled_height. + */ + +#endif /* IDCT_SCALING_SUPPORTED */ +} + + +/* + * Compute output image dimensions and related values. + * NOTE: this is exported for possible use by application. + * Hence it mustn't do anything that can't be done twice. + * Also note that it may be called before the master module is initialized! + */ + +GLOBAL(void) +jpeg_calc_output_dimensions (j_decompress_ptr cinfo) +/* Do computations that are needed before master selection phase */ +{ +#ifdef IDCT_SCALING_SUPPORTED + int ci; + jpeg_component_info *compptr; +#endif + + /* Prevent application from calling me at wrong times */ + if (cinfo->global_state != DSTATE_READY) + ERREXIT1(cinfo, JERR_BAD_STATE, cinfo->global_state); + + /* Compute core output image dimensions and DCT scaling choices. */ + jpeg_core_output_dimensions(cinfo); + +#ifdef IDCT_SCALING_SUPPORTED + + /* In selecting the actual DCT scaling for each component, we try to + * scale up the chroma components via IDCT scaling rather than upsampling. + * This saves time if the upsampler gets to use 1:1 scaling. + * Note this code adapts subsampling ratios which are powers of 2. + */ + for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components; + ci++, compptr++) { + int ssize = cinfo->_min_DCT_scaled_size; + while (ssize < DCTSIZE && + ((cinfo->max_h_samp_factor * cinfo->_min_DCT_scaled_size) % + (compptr->h_samp_factor * ssize * 2) == 0) && + ((cinfo->max_v_samp_factor * cinfo->_min_DCT_scaled_size) % + (compptr->v_samp_factor * ssize * 2) == 0)) { + ssize = ssize * 2; + } +#if JPEG_LIB_VERSION >= 70 + compptr->DCT_h_scaled_size = compptr->DCT_v_scaled_size = ssize; +#else + compptr->DCT_scaled_size = ssize; +#endif + } + + /* Recompute downsampled dimensions of components; + * application needs to know these if using raw downsampled data. + */ + for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components; + ci++, compptr++) { + /* Size in samples, after IDCT scaling */ + compptr->downsampled_width = (JDIMENSION) + jdiv_round_up((long) cinfo->image_width * + (long) (compptr->h_samp_factor * compptr->_DCT_scaled_size), + (long) (cinfo->max_h_samp_factor * DCTSIZE)); + compptr->downsampled_height = (JDIMENSION) + jdiv_round_up((long) cinfo->image_height * + (long) (compptr->v_samp_factor * compptr->_DCT_scaled_size), + (long) (cinfo->max_v_samp_factor * DCTSIZE)); + } + +#else /* !IDCT_SCALING_SUPPORTED */ + + /* Hardwire it to "no scaling" */ + cinfo->output_width = cinfo->image_width; + cinfo->output_height = cinfo->image_height; + /* jdinput.c has already initialized DCT_scaled_size to DCTSIZE, + * and has computed unscaled downsampled_width and downsampled_height. + */ + +#endif /* IDCT_SCALING_SUPPORTED */ + + /* Report number of components in selected colorspace. */ + /* Probably this should be in the color conversion module... */ + switch (cinfo->out_color_space) { + case JCS_GRAYSCALE: + cinfo->out_color_components = 1; + break; + case JCS_RGB: + case JCS_EXT_RGB: + case JCS_EXT_RGBX: + case JCS_EXT_BGR: + case JCS_EXT_BGRX: + case JCS_EXT_XBGR: + case JCS_EXT_XRGB: + case JCS_EXT_RGBA: + case JCS_EXT_BGRA: + case JCS_EXT_ABGR: + case JCS_EXT_ARGB: + cinfo->out_color_components = rgb_pixelsize[cinfo->out_color_space]; + break; + case JCS_YCbCr: + case JCS_RGB565: + cinfo->out_color_components = 3; + break; + case JCS_CMYK: + case JCS_YCCK: + cinfo->out_color_components = 4; + break; + default: /* else must be same colorspace as in file */ + cinfo->out_color_components = cinfo->num_components; + break; + } + cinfo->output_components = (cinfo->quantize_colors ? 1 : + cinfo->out_color_components); + + /* See if upsampler will want to emit more than one row at a time */ + if (use_merged_upsample(cinfo)) + cinfo->rec_outbuf_height = cinfo->max_v_samp_factor; + else + cinfo->rec_outbuf_height = 1; +} + + +/* + * Several decompression processes need to range-limit values to the range + * 0..MAXJSAMPLE; the input value may fall somewhat outside this range + * due to noise introduced by quantization, roundoff error, etc. These + * processes are inner loops and need to be as fast as possible. On most + * machines, particularly CPUs with pipelines or instruction prefetch, + * a (subscript-check-less) C table lookup + * x = sample_range_limit[x]; + * is faster than explicit tests + * if (x < 0) x = 0; + * else if (x > MAXJSAMPLE) x = MAXJSAMPLE; + * These processes all use a common table prepared by the routine below. + * + * For most steps we can mathematically guarantee that the initial value + * of x is within MAXJSAMPLE+1 of the legal range, so a table running from + * -(MAXJSAMPLE+1) to 2*MAXJSAMPLE+1 is sufficient. But for the initial + * limiting step (just after the IDCT), a wildly out-of-range value is + * possible if the input data is corrupt. To avoid any chance of indexing + * off the end of memory and getting a bad-pointer trap, we perform the + * post-IDCT limiting thus: + * x = range_limit[x & MASK]; + * where MASK is 2 bits wider than legal sample data, ie 10 bits for 8-bit + * samples. Under normal circumstances this is more than enough range and + * a correct output will be generated; with bogus input data the mask will + * cause wraparound, and we will safely generate a bogus-but-in-range output. + * For the post-IDCT step, we want to convert the data from signed to unsigned + * representation by adding CENTERJSAMPLE at the same time that we limit it. + * So the post-IDCT limiting table ends up looking like this: + * CENTERJSAMPLE,CENTERJSAMPLE+1,...,MAXJSAMPLE, + * MAXJSAMPLE (repeat 2*(MAXJSAMPLE+1)-CENTERJSAMPLE times), + * 0 (repeat 2*(MAXJSAMPLE+1)-CENTERJSAMPLE times), + * 0,1,...,CENTERJSAMPLE-1 + * Negative inputs select values from the upper half of the table after + * masking. + * + * We can save some space by overlapping the start of the post-IDCT table + * with the simpler range limiting table. The post-IDCT table begins at + * sample_range_limit + CENTERJSAMPLE. + */ + +LOCAL(void) +prepare_range_limit_table (j_decompress_ptr cinfo) +/* Allocate and fill in the sample_range_limit table */ +{ + JSAMPLE *table; + int i; + + table = (JSAMPLE *) + (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE, + (5 * (MAXJSAMPLE+1) + CENTERJSAMPLE) * sizeof(JSAMPLE)); + table += (MAXJSAMPLE+1); /* allow negative subscripts of simple table */ + cinfo->sample_range_limit = table; + /* First segment of "simple" table: limit[x] = 0 for x < 0 */ + MEMZERO(table - (MAXJSAMPLE+1), (MAXJSAMPLE+1) * sizeof(JSAMPLE)); + /* Main part of "simple" table: limit[x] = x */ + for (i = 0; i <= MAXJSAMPLE; i++) + table[i] = (JSAMPLE) i; + table += CENTERJSAMPLE; /* Point to where post-IDCT table starts */ + /* End of simple table, rest of first half of post-IDCT table */ + for (i = CENTERJSAMPLE; i < 2*(MAXJSAMPLE+1); i++) + table[i] = MAXJSAMPLE; + /* Second half of post-IDCT table */ + MEMZERO(table + (2 * (MAXJSAMPLE+1)), + (2 * (MAXJSAMPLE+1) - CENTERJSAMPLE) * sizeof(JSAMPLE)); + MEMCOPY(table + (4 * (MAXJSAMPLE+1) - CENTERJSAMPLE), + cinfo->sample_range_limit, CENTERJSAMPLE * sizeof(JSAMPLE)); +} + + +/* + * Master selection of decompression modules. + * This is done once at jpeg_start_decompress time. We determine + * which modules will be used and give them appropriate initialization calls. + * We also initialize the decompressor input side to begin consuming data. + * + * Since jpeg_read_header has finished, we know what is in the SOF + * and (first) SOS markers. We also have all the application parameter + * settings. + */ + +LOCAL(void) +master_selection (j_decompress_ptr cinfo) +{ + my_master_ptr master = (my_master_ptr) cinfo->master; + boolean use_c_buffer; + long samplesperrow; + JDIMENSION jd_samplesperrow; + + /* Initialize dimensions and other stuff */ + jpeg_calc_output_dimensions(cinfo); + prepare_range_limit_table(cinfo); + + /* Width of an output scanline must be representable as JDIMENSION. */ + samplesperrow = (long) cinfo->output_width * (long) cinfo->out_color_components; + jd_samplesperrow = (JDIMENSION) samplesperrow; + if ((long) jd_samplesperrow != samplesperrow) + ERREXIT(cinfo, JERR_WIDTH_OVERFLOW); + + /* Initialize my private state */ + master->pass_number = 0; + master->using_merged_upsample = use_merged_upsample(cinfo); + + /* Color quantizer selection */ + master->quantizer_1pass = NULL; + master->quantizer_2pass = NULL; + /* No mode changes if not using buffered-image mode. */ + if (! cinfo->quantize_colors || ! cinfo->buffered_image) { + cinfo->enable_1pass_quant = FALSE; + cinfo->enable_external_quant = FALSE; + cinfo->enable_2pass_quant = FALSE; + } + if (cinfo->quantize_colors) { + if (cinfo->raw_data_out) + ERREXIT(cinfo, JERR_NOTIMPL); + /* 2-pass quantizer only works in 3-component color space. */ + if (cinfo->out_color_components != 3) { + cinfo->enable_1pass_quant = TRUE; + cinfo->enable_external_quant = FALSE; + cinfo->enable_2pass_quant = FALSE; + cinfo->colormap = NULL; + } else if (cinfo->colormap != NULL) { + cinfo->enable_external_quant = TRUE; + } else if (cinfo->two_pass_quantize) { + cinfo->enable_2pass_quant = TRUE; + } else { + cinfo->enable_1pass_quant = TRUE; + } + + if (cinfo->enable_1pass_quant) { +#ifdef QUANT_1PASS_SUPPORTED + jinit_1pass_quantizer(cinfo); + master->quantizer_1pass = cinfo->cquantize; +#else + ERREXIT(cinfo, JERR_NOT_COMPILED); +#endif + } + + /* We use the 2-pass code to map to external colormaps. */ + if (cinfo->enable_2pass_quant || cinfo->enable_external_quant) { +#ifdef QUANT_2PASS_SUPPORTED + jinit_2pass_quantizer(cinfo); + master->quantizer_2pass = cinfo->cquantize; +#else + ERREXIT(cinfo, JERR_NOT_COMPILED); +#endif + } + /* If both quantizers are initialized, the 2-pass one is left active; + * this is necessary for starting with quantization to an external map. + */ + } + + /* Post-processing: in particular, color conversion first */ + if (! cinfo->raw_data_out) { + if (master->using_merged_upsample) { +#ifdef UPSAMPLE_MERGING_SUPPORTED + jinit_merged_upsampler(cinfo); /* does color conversion too */ +#else + ERREXIT(cinfo, JERR_NOT_COMPILED); +#endif + } else { + jinit_color_deconverter(cinfo); + jinit_upsampler(cinfo); + } + jinit_d_post_controller(cinfo, cinfo->enable_2pass_quant); + } + /* Inverse DCT */ + jinit_inverse_dct(cinfo); + /* Entropy decoding: either Huffman or arithmetic coding. */ + if (cinfo->arith_code) { +#ifdef D_ARITH_CODING_SUPPORTED + jinit_arith_decoder(cinfo); +#else + ERREXIT(cinfo, JERR_ARITH_NOTIMPL); +#endif + } else { + if (cinfo->progressive_mode) { +#ifdef D_PROGRESSIVE_SUPPORTED + jinit_phuff_decoder(cinfo); +#else + ERREXIT(cinfo, JERR_NOT_COMPILED); +#endif + } else + jinit_huff_decoder(cinfo); + } + + /* Initialize principal buffer controllers. */ + use_c_buffer = cinfo->inputctl->has_multiple_scans || cinfo->buffered_image; + jinit_d_coef_controller(cinfo, use_c_buffer); + + if (! cinfo->raw_data_out) + jinit_d_main_controller(cinfo, FALSE /* never need full buffer here */); + + /* We can now tell the memory manager to allocate virtual arrays. */ + (*cinfo->mem->realize_virt_arrays) ((j_common_ptr) cinfo); + + /* Initialize input side of decompressor to consume first scan. */ + (*cinfo->inputctl->start_input_pass) (cinfo); + + /* Set the first and last iMCU columns to decompress from single-scan images. + * By default, decompress all of the iMCU columns. + */ + cinfo->master->first_iMCU_col = 0; + cinfo->master->last_iMCU_col = cinfo->MCUs_per_row - 1; + +#ifdef D_MULTISCAN_FILES_SUPPORTED + /* If jpeg_start_decompress will read the whole file, initialize + * progress monitoring appropriately. The input step is counted + * as one pass. + */ + if (cinfo->progress != NULL && ! cinfo->buffered_image && + cinfo->inputctl->has_multiple_scans) { + int nscans; + /* Estimate number of scans to set pass_limit. */ + if (cinfo->progressive_mode) { + /* Arbitrarily estimate 2 interleaved DC scans + 3 AC scans/component. */ + nscans = 2 + 3 * cinfo->num_components; + } else { + /* For a nonprogressive multiscan file, estimate 1 scan per component. */ + nscans = cinfo->num_components; + } + cinfo->progress->pass_counter = 0L; + cinfo->progress->pass_limit = (long) cinfo->total_iMCU_rows * nscans; + cinfo->progress->completed_passes = 0; + cinfo->progress->total_passes = (cinfo->enable_2pass_quant ? 3 : 2); + /* Count the input pass as done */ + master->pass_number++; + } +#endif /* D_MULTISCAN_FILES_SUPPORTED */ +} + + +/* + * Per-pass setup. + * This is called at the beginning of each output pass. We determine which + * modules will be active during this pass and give them appropriate + * start_pass calls. We also set is_dummy_pass to indicate whether this + * is a "real" output pass or a dummy pass for color quantization. + * (In the latter case, jdapistd.c will crank the pass to completion.) + */ + +METHODDEF(void) +prepare_for_output_pass (j_decompress_ptr cinfo) +{ + my_master_ptr master = (my_master_ptr) cinfo->master; + + if (master->pub.is_dummy_pass) { +#ifdef QUANT_2PASS_SUPPORTED + /* Final pass of 2-pass quantization */ + master->pub.is_dummy_pass = FALSE; + (*cinfo->cquantize->start_pass) (cinfo, FALSE); + (*cinfo->post->start_pass) (cinfo, JBUF_CRANK_DEST); + (*cinfo->main->start_pass) (cinfo, JBUF_CRANK_DEST); +#else + ERREXIT(cinfo, JERR_NOT_COMPILED); +#endif /* QUANT_2PASS_SUPPORTED */ + } else { + if (cinfo->quantize_colors && cinfo->colormap == NULL) { + /* Select new quantization method */ + if (cinfo->two_pass_quantize && cinfo->enable_2pass_quant) { + cinfo->cquantize = master->quantizer_2pass; + master->pub.is_dummy_pass = TRUE; + } else if (cinfo->enable_1pass_quant) { + cinfo->cquantize = master->quantizer_1pass; + } else { + ERREXIT(cinfo, JERR_MODE_CHANGE); + } + } + (*cinfo->idct->start_pass) (cinfo); + (*cinfo->coef->start_output_pass) (cinfo); + if (! cinfo->raw_data_out) { + if (! master->using_merged_upsample) + (*cinfo->cconvert->start_pass) (cinfo); + (*cinfo->upsample->start_pass) (cinfo); + if (cinfo->quantize_colors) + (*cinfo->cquantize->start_pass) (cinfo, master->pub.is_dummy_pass); + (*cinfo->post->start_pass) (cinfo, + (master->pub.is_dummy_pass ? JBUF_SAVE_AND_PASS : JBUF_PASS_THRU)); + (*cinfo->main->start_pass) (cinfo, JBUF_PASS_THRU); + } + } + + /* Set up progress monitor's pass info if present */ + if (cinfo->progress != NULL) { + cinfo->progress->completed_passes = master->pass_number; + cinfo->progress->total_passes = master->pass_number + + (master->pub.is_dummy_pass ? 2 : 1); + /* In buffered-image mode, we assume one more output pass if EOI not + * yet reached, but no more passes if EOI has been reached. + */ + if (cinfo->buffered_image && ! cinfo->inputctl->eoi_reached) { + cinfo->progress->total_passes += (cinfo->enable_2pass_quant ? 2 : 1); + } + } +} + + +/* + * Finish up at end of an output pass. + */ + +METHODDEF(void) +finish_output_pass (j_decompress_ptr cinfo) +{ + my_master_ptr master = (my_master_ptr) cinfo->master; + + if (cinfo->quantize_colors) + (*cinfo->cquantize->finish_pass) (cinfo); + master->pass_number++; +} + + +#ifdef D_MULTISCAN_FILES_SUPPORTED + +/* + * Switch to a new external colormap between output passes. + */ + +GLOBAL(void) +jpeg_new_colormap (j_decompress_ptr cinfo) +{ + my_master_ptr master = (my_master_ptr) cinfo->master; + + /* Prevent application from calling me at wrong times */ + if (cinfo->global_state != DSTATE_BUFIMAGE) + ERREXIT1(cinfo, JERR_BAD_STATE, cinfo->global_state); + + if (cinfo->quantize_colors && cinfo->enable_external_quant && + cinfo->colormap != NULL) { + /* Select 2-pass quantizer for external colormap use */ + cinfo->cquantize = master->quantizer_2pass; + /* Notify quantizer of colormap change */ + (*cinfo->cquantize->new_color_map) (cinfo); + master->pub.is_dummy_pass = FALSE; /* just in case */ + } else + ERREXIT(cinfo, JERR_MODE_CHANGE); +} + +#endif /* D_MULTISCAN_FILES_SUPPORTED */ + + +/* + * Initialize master decompression control and select active modules. + * This is performed at the start of jpeg_start_decompress. + */ + +GLOBAL(void) +jinit_master_decompress (j_decompress_ptr cinfo) +{ + my_master_ptr master = (my_master_ptr) cinfo->master; + + master->pub.prepare_for_output_pass = prepare_for_output_pass; + master->pub.finish_output_pass = finish_output_pass; + + master->pub.is_dummy_pass = FALSE; + master->pub.jinit_upsampler_no_alloc = FALSE; + + master_selection(cinfo); +} diff --git a/media/libjpeg/jdmaster.h b/media/libjpeg/jdmaster.h new file mode 100644 index 000000000..76897e282 --- /dev/null +++ b/media/libjpeg/jdmaster.h @@ -0,0 +1,28 @@ +/* + * jdmaster.h + * + * This file was part of the Independent JPEG Group's software: + * Copyright (C) 1991-1995, Thomas G. Lane. + * For conditions of distribution and use, see the accompanying README.ijg + * file. + * + * This file contains the master control structure for the JPEG decompressor. + */ + +/* Private state */ + +typedef struct { + struct jpeg_decomp_master pub; /* public fields */ + + int pass_number; /* # of passes completed */ + + boolean using_merged_upsample; /* TRUE if using merged upsample/cconvert */ + + /* Saved references to initialized quantizer modules, + * in case we need to switch modes. + */ + struct jpeg_color_quantizer *quantizer_1pass; + struct jpeg_color_quantizer *quantizer_2pass; +} my_decomp_master; + +typedef my_decomp_master *my_master_ptr; diff --git a/media/libjpeg/jdmerge.c b/media/libjpeg/jdmerge.c new file mode 100644 index 000000000..6276dd095 --- /dev/null +++ b/media/libjpeg/jdmerge.c @@ -0,0 +1,627 @@ +/* + * jdmerge.c + * + * This file was part of the Independent JPEG Group's software: + * Copyright (C) 1994-1996, Thomas G. Lane. + * libjpeg-turbo Modifications: + * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB + * Copyright (C) 2009, 2011, 2014-2015, D. R. Commander. + * Copyright (C) 2013, Linaro Limited. + * For conditions of distribution and use, see the accompanying README.ijg + * file. + * + * This file contains code for merged upsampling/color conversion. + * + * This file combines functions from jdsample.c and jdcolor.c; + * read those files first to understand what's going on. + * + * When the chroma components are to be upsampled by simple replication + * (ie, box filtering), we can save some work in color conversion by + * calculating all the output pixels corresponding to a pair of chroma + * samples at one time. In the conversion equations + * R = Y + K1 * Cr + * G = Y + K2 * Cb + K3 * Cr + * B = Y + K4 * Cb + * only the Y term varies among the group of pixels corresponding to a pair + * of chroma samples, so the rest of the terms can be calculated just once. + * At typical sampling ratios, this eliminates half or three-quarters of the + * multiplications needed for color conversion. + * + * This file currently provides implementations for the following cases: + * YCbCr => RGB color conversion only. + * Sampling ratios of 2h1v or 2h2v. + * No scaling needed at upsample time. + * Corner-aligned (non-CCIR601) sampling alignment. + * Other special cases could be added, but in most applications these are + * the only common cases. (For uncommon cases we fall back on the more + * general code in jdsample.c and jdcolor.c.) + */ + +#define JPEG_INTERNALS +#include "jinclude.h" +#include "jpeglib.h" +#include "jsimd.h" +#include "jconfigint.h" + +#ifdef UPSAMPLE_MERGING_SUPPORTED + + +/* Private subobject */ + +typedef struct { + struct jpeg_upsampler pub; /* public fields */ + + /* Pointer to routine to do actual upsampling/conversion of one row group */ + void (*upmethod) (j_decompress_ptr cinfo, JSAMPIMAGE input_buf, + JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf); + + /* Private state for YCC->RGB conversion */ + int *Cr_r_tab; /* => table for Cr to R conversion */ + int *Cb_b_tab; /* => table for Cb to B conversion */ + JLONG *Cr_g_tab; /* => table for Cr to G conversion */ + JLONG *Cb_g_tab; /* => table for Cb to G conversion */ + + /* For 2:1 vertical sampling, we produce two output rows at a time. + * We need a "spare" row buffer to hold the second output row if the + * application provides just a one-row buffer; we also use the spare + * to discard the dummy last row if the image height is odd. + */ + JSAMPROW spare_row; + boolean spare_full; /* T if spare buffer is occupied */ + + JDIMENSION out_row_width; /* samples per output row */ + JDIMENSION rows_to_go; /* counts rows remaining in image */ +} my_upsampler; + +typedef my_upsampler *my_upsample_ptr; + +#define SCALEBITS 16 /* speediest right-shift on some machines */ +#define ONE_HALF ((JLONG) 1 << (SCALEBITS-1)) +#define FIX(x) ((JLONG) ((x) * (1L<<SCALEBITS) + 0.5)) + + +/* Include inline routines for colorspace extensions */ + +#include "jdmrgext.c" +#undef RGB_RED +#undef RGB_GREEN +#undef RGB_BLUE +#undef RGB_PIXELSIZE + +#define RGB_RED EXT_RGB_RED +#define RGB_GREEN EXT_RGB_GREEN +#define RGB_BLUE EXT_RGB_BLUE +#define RGB_PIXELSIZE EXT_RGB_PIXELSIZE +#define h2v1_merged_upsample_internal extrgb_h2v1_merged_upsample_internal +#define h2v2_merged_upsample_internal extrgb_h2v2_merged_upsample_internal +#include "jdmrgext.c" +#undef RGB_RED +#undef RGB_GREEN +#undef RGB_BLUE +#undef RGB_PIXELSIZE +#undef h2v1_merged_upsample_internal +#undef h2v2_merged_upsample_internal + +#define RGB_RED EXT_RGBX_RED +#define RGB_GREEN EXT_RGBX_GREEN +#define RGB_BLUE EXT_RGBX_BLUE +#define RGB_ALPHA 3 +#define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE +#define h2v1_merged_upsample_internal extrgbx_h2v1_merged_upsample_internal +#define h2v2_merged_upsample_internal extrgbx_h2v2_merged_upsample_internal +#include "jdmrgext.c" +#undef RGB_RED +#undef RGB_GREEN +#undef RGB_BLUE +#undef RGB_ALPHA +#undef RGB_PIXELSIZE +#undef h2v1_merged_upsample_internal +#undef h2v2_merged_upsample_internal + +#define RGB_RED EXT_BGR_RED +#define RGB_GREEN EXT_BGR_GREEN +#define RGB_BLUE EXT_BGR_BLUE +#define RGB_PIXELSIZE EXT_BGR_PIXELSIZE +#define h2v1_merged_upsample_internal extbgr_h2v1_merged_upsample_internal +#define h2v2_merged_upsample_internal extbgr_h2v2_merged_upsample_internal +#include "jdmrgext.c" +#undef RGB_RED +#undef RGB_GREEN +#undef RGB_BLUE +#undef RGB_PIXELSIZE +#undef h2v1_merged_upsample_internal +#undef h2v2_merged_upsample_internal + +#define RGB_RED EXT_BGRX_RED +#define RGB_GREEN EXT_BGRX_GREEN +#define RGB_BLUE EXT_BGRX_BLUE +#define RGB_ALPHA 3 +#define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE +#define h2v1_merged_upsample_internal extbgrx_h2v1_merged_upsample_internal +#define h2v2_merged_upsample_internal extbgrx_h2v2_merged_upsample_internal +#include "jdmrgext.c" +#undef RGB_RED +#undef RGB_GREEN +#undef RGB_BLUE +#undef RGB_ALPHA +#undef RGB_PIXELSIZE +#undef h2v1_merged_upsample_internal +#undef h2v2_merged_upsample_internal + +#define RGB_RED EXT_XBGR_RED +#define RGB_GREEN EXT_XBGR_GREEN +#define RGB_BLUE EXT_XBGR_BLUE +#define RGB_ALPHA 0 +#define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE +#define h2v1_merged_upsample_internal extxbgr_h2v1_merged_upsample_internal +#define h2v2_merged_upsample_internal extxbgr_h2v2_merged_upsample_internal +#include "jdmrgext.c" +#undef RGB_RED +#undef RGB_GREEN +#undef RGB_BLUE +#undef RGB_ALPHA +#undef RGB_PIXELSIZE +#undef h2v1_merged_upsample_internal +#undef h2v2_merged_upsample_internal + +#define RGB_RED EXT_XRGB_RED +#define RGB_GREEN EXT_XRGB_GREEN +#define RGB_BLUE EXT_XRGB_BLUE +#define RGB_ALPHA 0 +#define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE +#define h2v1_merged_upsample_internal extxrgb_h2v1_merged_upsample_internal +#define h2v2_merged_upsample_internal extxrgb_h2v2_merged_upsample_internal +#include "jdmrgext.c" +#undef RGB_RED +#undef RGB_GREEN +#undef RGB_BLUE +#undef RGB_ALPHA +#undef RGB_PIXELSIZE +#undef h2v1_merged_upsample_internal +#undef h2v2_merged_upsample_internal + + +/* + * Initialize tables for YCC->RGB colorspace conversion. + * This is taken directly from jdcolor.c; see that file for more info. + */ + +LOCAL(void) +build_ycc_rgb_table (j_decompress_ptr cinfo) +{ + my_upsample_ptr upsample = (my_upsample_ptr) cinfo->upsample; + int i; + JLONG x; + SHIFT_TEMPS + + upsample->Cr_r_tab = (int *) + (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE, + (MAXJSAMPLE+1) * sizeof(int)); + upsample->Cb_b_tab = (int *) + (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE, + (MAXJSAMPLE+1) * sizeof(int)); + upsample->Cr_g_tab = (JLONG *) + (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE, + (MAXJSAMPLE+1) * sizeof(JLONG)); + upsample->Cb_g_tab = (JLONG *) + (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE, + (MAXJSAMPLE+1) * sizeof(JLONG)); + + for (i = 0, x = -CENTERJSAMPLE; i <= MAXJSAMPLE; i++, x++) { + /* i is the actual input pixel value, in the range 0..MAXJSAMPLE */ + /* The Cb or Cr value we are thinking of is x = i - CENTERJSAMPLE */ + /* Cr=>R value is nearest int to 1.40200 * x */ + upsample->Cr_r_tab[i] = (int) + RIGHT_SHIFT(FIX(1.40200) * x + ONE_HALF, SCALEBITS); + /* Cb=>B value is nearest int to 1.77200 * x */ + upsample->Cb_b_tab[i] = (int) + RIGHT_SHIFT(FIX(1.77200) * x + ONE_HALF, SCALEBITS); + /* Cr=>G value is scaled-up -0.71414 * x */ + upsample->Cr_g_tab[i] = (- FIX(0.71414)) * x; + /* Cb=>G value is scaled-up -0.34414 * x */ + /* We also add in ONE_HALF so that need not do it in inner loop */ + upsample->Cb_g_tab[i] = (- FIX(0.34414)) * x + ONE_HALF; + } +} + + +/* + * Initialize for an upsampling pass. + */ + +METHODDEF(void) +start_pass_merged_upsample (j_decompress_ptr cinfo) +{ + my_upsample_ptr upsample = (my_upsample_ptr) cinfo->upsample; + + /* Mark the spare buffer empty */ + upsample->spare_full = FALSE; + /* Initialize total-height counter for detecting bottom of image */ + upsample->rows_to_go = cinfo->output_height; +} + + +/* + * Control routine to do upsampling (and color conversion). + * + * The control routine just handles the row buffering considerations. + */ + +METHODDEF(void) +merged_2v_upsample (j_decompress_ptr cinfo, + JSAMPIMAGE input_buf, JDIMENSION *in_row_group_ctr, + JDIMENSION in_row_groups_avail, + JSAMPARRAY output_buf, JDIMENSION *out_row_ctr, + JDIMENSION out_rows_avail) +/* 2:1 vertical sampling case: may need a spare row. */ +{ + my_upsample_ptr upsample = (my_upsample_ptr) cinfo->upsample; + JSAMPROW work_ptrs[2]; + JDIMENSION num_rows; /* number of rows returned to caller */ + + if (upsample->spare_full) { + /* If we have a spare row saved from a previous cycle, just return it. */ + JDIMENSION size = upsample->out_row_width; + if (cinfo->out_color_space == JCS_RGB565) + size = cinfo->output_width * 2; + jcopy_sample_rows(& upsample->spare_row, 0, output_buf + *out_row_ctr, 0, + 1, size); + num_rows = 1; + upsample->spare_full = FALSE; + } else { + /* Figure number of rows to return to caller. */ + num_rows = 2; + /* Not more than the distance to the end of the image. */ + if (num_rows > upsample->rows_to_go) + num_rows = upsample->rows_to_go; + /* And not more than what the client can accept: */ + out_rows_avail -= *out_row_ctr; + if (num_rows > out_rows_avail) + num_rows = out_rows_avail; + /* Create output pointer array for upsampler. */ + work_ptrs[0] = output_buf[*out_row_ctr]; + if (num_rows > 1) { + work_ptrs[1] = output_buf[*out_row_ctr + 1]; + } else { + work_ptrs[1] = upsample->spare_row; + upsample->spare_full = TRUE; + } + /* Now do the upsampling. */ + (*upsample->upmethod) (cinfo, input_buf, *in_row_group_ctr, work_ptrs); + } + + /* Adjust counts */ + *out_row_ctr += num_rows; + upsample->rows_to_go -= num_rows; + /* When the buffer is emptied, declare this input row group consumed */ + if (! upsample->spare_full) + (*in_row_group_ctr)++; +} + + +METHODDEF(void) +merged_1v_upsample (j_decompress_ptr cinfo, + JSAMPIMAGE input_buf, JDIMENSION *in_row_group_ctr, + JDIMENSION in_row_groups_avail, + JSAMPARRAY output_buf, JDIMENSION *out_row_ctr, + JDIMENSION out_rows_avail) +/* 1:1 vertical sampling case: much easier, never need a spare row. */ +{ + my_upsample_ptr upsample = (my_upsample_ptr) cinfo->upsample; + + /* Just do the upsampling. */ + (*upsample->upmethod) (cinfo, input_buf, *in_row_group_ctr, + output_buf + *out_row_ctr); + /* Adjust counts */ + (*out_row_ctr)++; + (*in_row_group_ctr)++; +} + + +/* + * These are the routines invoked by the control routines to do + * the actual upsampling/conversion. One row group is processed per call. + * + * Note: since we may be writing directly into application-supplied buffers, + * we have to be honest about the output width; we can't assume the buffer + * has been rounded up to an even width. + */ + + +/* + * Upsample and color convert for the case of 2:1 horizontal and 1:1 vertical. + */ + +METHODDEF(void) +h2v1_merged_upsample (j_decompress_ptr cinfo, + JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr, + JSAMPARRAY output_buf) +{ + switch (cinfo->out_color_space) { + case JCS_EXT_RGB: + extrgb_h2v1_merged_upsample_internal(cinfo, input_buf, in_row_group_ctr, + output_buf); + break; + case JCS_EXT_RGBX: + case JCS_EXT_RGBA: + extrgbx_h2v1_merged_upsample_internal(cinfo, input_buf, in_row_group_ctr, + output_buf); + break; + case JCS_EXT_BGR: + extbgr_h2v1_merged_upsample_internal(cinfo, input_buf, in_row_group_ctr, + output_buf); + break; + case JCS_EXT_BGRX: + case JCS_EXT_BGRA: + extbgrx_h2v1_merged_upsample_internal(cinfo, input_buf, in_row_group_ctr, + output_buf); + break; + case JCS_EXT_XBGR: + case JCS_EXT_ABGR: + extxbgr_h2v1_merged_upsample_internal(cinfo, input_buf, in_row_group_ctr, + output_buf); + break; + case JCS_EXT_XRGB: + case JCS_EXT_ARGB: + extxrgb_h2v1_merged_upsample_internal(cinfo, input_buf, in_row_group_ctr, + output_buf); + break; + default: + h2v1_merged_upsample_internal(cinfo, input_buf, in_row_group_ctr, + output_buf); + break; + } +} + + +/* + * Upsample and color convert for the case of 2:1 horizontal and 2:1 vertical. + */ + +METHODDEF(void) +h2v2_merged_upsample (j_decompress_ptr cinfo, + JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr, + JSAMPARRAY output_buf) +{ + switch (cinfo->out_color_space) { + case JCS_EXT_RGB: + extrgb_h2v2_merged_upsample_internal(cinfo, input_buf, in_row_group_ctr, + output_buf); + break; + case JCS_EXT_RGBX: + case JCS_EXT_RGBA: + extrgbx_h2v2_merged_upsample_internal(cinfo, input_buf, in_row_group_ctr, + output_buf); + break; + case JCS_EXT_BGR: + extbgr_h2v2_merged_upsample_internal(cinfo, input_buf, in_row_group_ctr, + output_buf); + break; + case JCS_EXT_BGRX: + case JCS_EXT_BGRA: + extbgrx_h2v2_merged_upsample_internal(cinfo, input_buf, in_row_group_ctr, + output_buf); + break; + case JCS_EXT_XBGR: + case JCS_EXT_ABGR: + extxbgr_h2v2_merged_upsample_internal(cinfo, input_buf, in_row_group_ctr, + output_buf); + break; + case JCS_EXT_XRGB: + case JCS_EXT_ARGB: + extxrgb_h2v2_merged_upsample_internal(cinfo, input_buf, in_row_group_ctr, + output_buf); + break; + default: + h2v2_merged_upsample_internal(cinfo, input_buf, in_row_group_ctr, + output_buf); + break; + } +} + + +/* + * RGB565 conversion + */ + +#define PACK_SHORT_565_LE(r, g, b) ((((r) << 8) & 0xF800) | \ + (((g) << 3) & 0x7E0) | ((b) >> 3)) +#define PACK_SHORT_565_BE(r, g, b) (((r) & 0xF8) | ((g) >> 5) | \ + (((g) << 11) & 0xE000) | \ + (((b) << 5) & 0x1F00)) + +#define PACK_TWO_PIXELS_LE(l, r) ((r << 16) | l) +#define PACK_TWO_PIXELS_BE(l, r) ((l << 16) | r) + +#define PACK_NEED_ALIGNMENT(ptr) (((size_t)(ptr)) & 3) + +#define WRITE_TWO_PIXELS_LE(addr, pixels) { \ + ((INT16*)(addr))[0] = (INT16)(pixels); \ + ((INT16*)(addr))[1] = (INT16)((pixels) >> 16); \ +} +#define WRITE_TWO_PIXELS_BE(addr, pixels) { \ + ((INT16*)(addr))[1] = (INT16)(pixels); \ + ((INT16*)(addr))[0] = (INT16)((pixels) >> 16); \ +} + +#define DITHER_565_R(r, dither) ((r) + ((dither) & 0xFF)) +#define DITHER_565_G(g, dither) ((g) + (((dither) & 0xFF) >> 1)) +#define DITHER_565_B(b, dither) ((b) + ((dither) & 0xFF)) + + +/* Declarations for ordered dithering + * + * We use a 4x4 ordered dither array packed into 32 bits. This array is + * sufficent for dithering RGB888 to RGB565. + */ + +#define DITHER_MASK 0x3 +#define DITHER_ROTATE(x) ((((x) & 0xFF) << 24) | (((x) >> 8) & 0x00FFFFFF)) +static const JLONG dither_matrix[4] = { + 0x0008020A, + 0x0C040E06, + 0x030B0109, + 0x0F070D05 +}; + + +/* Include inline routines for RGB565 conversion */ + +#define PACK_SHORT_565 PACK_SHORT_565_LE +#define PACK_TWO_PIXELS PACK_TWO_PIXELS_LE +#define WRITE_TWO_PIXELS WRITE_TWO_PIXELS_LE +#define h2v1_merged_upsample_565_internal h2v1_merged_upsample_565_le +#define h2v1_merged_upsample_565D_internal h2v1_merged_upsample_565D_le +#define h2v2_merged_upsample_565_internal h2v2_merged_upsample_565_le +#define h2v2_merged_upsample_565D_internal h2v2_merged_upsample_565D_le +#include "jdmrg565.c" +#undef PACK_SHORT_565 +#undef PACK_TWO_PIXELS +#undef WRITE_TWO_PIXELS +#undef h2v1_merged_upsample_565_internal +#undef h2v1_merged_upsample_565D_internal +#undef h2v2_merged_upsample_565_internal +#undef h2v2_merged_upsample_565D_internal + +#define PACK_SHORT_565 PACK_SHORT_565_BE +#define PACK_TWO_PIXELS PACK_TWO_PIXELS_BE +#define WRITE_TWO_PIXELS WRITE_TWO_PIXELS_BE +#define h2v1_merged_upsample_565_internal h2v1_merged_upsample_565_be +#define h2v1_merged_upsample_565D_internal h2v1_merged_upsample_565D_be +#define h2v2_merged_upsample_565_internal h2v2_merged_upsample_565_be +#define h2v2_merged_upsample_565D_internal h2v2_merged_upsample_565D_be +#include "jdmrg565.c" +#undef PACK_SHORT_565 +#undef PACK_TWO_PIXELS +#undef WRITE_TWO_PIXELS +#undef h2v1_merged_upsample_565_internal +#undef h2v1_merged_upsample_565D_internal +#undef h2v2_merged_upsample_565_internal +#undef h2v2_merged_upsample_565D_internal + + +static INLINE boolean is_big_endian(void) +{ + int test_value = 1; + if(*(char *)&test_value != 1) + return TRUE; + return FALSE; +} + + +METHODDEF(void) +h2v1_merged_upsample_565 (j_decompress_ptr cinfo, + JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr, + JSAMPARRAY output_buf) +{ + if (is_big_endian()) + h2v1_merged_upsample_565_be(cinfo, input_buf, in_row_group_ctr, + output_buf); + else + h2v1_merged_upsample_565_le(cinfo, input_buf, in_row_group_ctr, + output_buf); + } + + +METHODDEF(void) +h2v1_merged_upsample_565D (j_decompress_ptr cinfo, + JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr, + JSAMPARRAY output_buf) +{ + if (is_big_endian()) + h2v1_merged_upsample_565D_be(cinfo, input_buf, in_row_group_ctr, + output_buf); + else + h2v1_merged_upsample_565D_le(cinfo, input_buf, in_row_group_ctr, + output_buf); +} + + +METHODDEF(void) +h2v2_merged_upsample_565 (j_decompress_ptr cinfo, + JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr, + JSAMPARRAY output_buf) +{ + if (is_big_endian()) + h2v2_merged_upsample_565_be(cinfo, input_buf, in_row_group_ctr, + output_buf); + else + h2v2_merged_upsample_565_le(cinfo, input_buf, in_row_group_ctr, + output_buf); +} + + +METHODDEF(void) +h2v2_merged_upsample_565D (j_decompress_ptr cinfo, + JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr, + JSAMPARRAY output_buf) +{ + if (is_big_endian()) + h2v2_merged_upsample_565D_be(cinfo, input_buf, in_row_group_ctr, + output_buf); + else + h2v2_merged_upsample_565D_le(cinfo, input_buf, in_row_group_ctr, + output_buf); +} + + +/* + * Module initialization routine for merged upsampling/color conversion. + * + * NB: this is called under the conditions determined by use_merged_upsample() + * in jdmaster.c. That routine MUST correspond to the actual capabilities + * of this module; no safety checks are made here. + */ + +GLOBAL(void) +jinit_merged_upsampler (j_decompress_ptr cinfo) +{ + my_upsample_ptr upsample; + + upsample = (my_upsample_ptr) + (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE, + sizeof(my_upsampler)); + cinfo->upsample = (struct jpeg_upsampler *) upsample; + upsample->pub.start_pass = start_pass_merged_upsample; + upsample->pub.need_context_rows = FALSE; + + upsample->out_row_width = cinfo->output_width * cinfo->out_color_components; + + if (cinfo->max_v_samp_factor == 2) { + upsample->pub.upsample = merged_2v_upsample; + if (jsimd_can_h2v2_merged_upsample()) + upsample->upmethod = jsimd_h2v2_merged_upsample; + else + upsample->upmethod = h2v2_merged_upsample; + if (cinfo->out_color_space == JCS_RGB565) { + if (cinfo->dither_mode != JDITHER_NONE) { + upsample->upmethod = h2v2_merged_upsample_565D; + } else { + upsample->upmethod = h2v2_merged_upsample_565; + } + } + /* Allocate a spare row buffer */ + upsample->spare_row = (JSAMPROW) + (*cinfo->mem->alloc_large) ((j_common_ptr) cinfo, JPOOL_IMAGE, + (size_t) (upsample->out_row_width * sizeof(JSAMPLE))); + } else { + upsample->pub.upsample = merged_1v_upsample; + if (jsimd_can_h2v1_merged_upsample()) + upsample->upmethod = jsimd_h2v1_merged_upsample; + else + upsample->upmethod = h2v1_merged_upsample; + if (cinfo->out_color_space == JCS_RGB565) { + if (cinfo->dither_mode != JDITHER_NONE) { + upsample->upmethod = h2v1_merged_upsample_565D; + } else { + upsample->upmethod = h2v1_merged_upsample_565; + } + } + /* No spare row needed */ + upsample->spare_row = NULL; + } + + build_ycc_rgb_table(cinfo); +} + +#endif /* UPSAMPLE_MERGING_SUPPORTED */ diff --git a/media/libjpeg/jdmrg565.c b/media/libjpeg/jdmrg565.c new file mode 100644 index 000000000..18287b373 --- /dev/null +++ b/media/libjpeg/jdmrg565.c @@ -0,0 +1,356 @@ +/* + * jdmrg565.c + * + * This file was part of the Independent JPEG Group's software: + * Copyright (C) 1994-1996, Thomas G. Lane. + * libjpeg-turbo Modifications: + * Copyright (C) 2013, Linaro Limited. + * Copyright (C) 2014-2015, D. R. Commander. + * For conditions of distribution and use, see the accompanying README.ijg + * file. + * + * This file contains code for merged upsampling/color conversion. + */ + + +INLINE +LOCAL(void) +h2v1_merged_upsample_565_internal (j_decompress_ptr cinfo, + JSAMPIMAGE input_buf, + JDIMENSION in_row_group_ctr, + JSAMPARRAY output_buf) +{ + my_upsample_ptr upsample = (my_upsample_ptr) cinfo->upsample; + register int y, cred, cgreen, cblue; + int cb, cr; + register JSAMPROW outptr; + JSAMPROW inptr0, inptr1, inptr2; + JDIMENSION col; + /* copy these pointers into registers if possible */ + register JSAMPLE * range_limit = cinfo->sample_range_limit; + int * Crrtab = upsample->Cr_r_tab; + int * Cbbtab = upsample->Cb_b_tab; + JLONG * Crgtab = upsample->Cr_g_tab; + JLONG * Cbgtab = upsample->Cb_g_tab; + unsigned int r, g, b; + JLONG rgb; + SHIFT_TEMPS + + inptr0 = input_buf[0][in_row_group_ctr]; + inptr1 = input_buf[1][in_row_group_ctr]; + inptr2 = input_buf[2][in_row_group_ctr]; + outptr = output_buf[0]; + + /* Loop for each pair of output pixels */ + for (col = cinfo->output_width >> 1; col > 0; col--) { + /* Do the chroma part of the calculation */ + cb = GETJSAMPLE(*inptr1++); + cr = GETJSAMPLE(*inptr2++); + cred = Crrtab[cr]; + cgreen = (int) RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr], SCALEBITS); + cblue = Cbbtab[cb]; + + /* Fetch 2 Y values and emit 2 pixels */ + y = GETJSAMPLE(*inptr0++); + r = range_limit[y + cred]; + g = range_limit[y + cgreen]; + b = range_limit[y + cblue]; + rgb = PACK_SHORT_565(r, g, b); + + y = GETJSAMPLE(*inptr0++); + r = range_limit[y + cred]; + g = range_limit[y + cgreen]; + b = range_limit[y + cblue]; + rgb = PACK_TWO_PIXELS(rgb, PACK_SHORT_565(r, g, b)); + + WRITE_TWO_PIXELS(outptr, rgb); + outptr += 4; + } + + /* If image width is odd, do the last output column separately */ + if (cinfo->output_width & 1) { + cb = GETJSAMPLE(*inptr1); + cr = GETJSAMPLE(*inptr2); + cred = Crrtab[cr]; + cgreen = (int) RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr], SCALEBITS); + cblue = Cbbtab[cb]; + y = GETJSAMPLE(*inptr0); + r = range_limit[y + cred]; + g = range_limit[y + cgreen]; + b = range_limit[y + cblue]; + rgb = PACK_SHORT_565(r, g, b); + *(INT16*)outptr = (INT16)rgb; + } + } + + +INLINE +LOCAL(void) +h2v1_merged_upsample_565D_internal (j_decompress_ptr cinfo, + JSAMPIMAGE input_buf, + JDIMENSION in_row_group_ctr, + JSAMPARRAY output_buf) +{ + my_upsample_ptr upsample = (my_upsample_ptr) cinfo->upsample; + register int y, cred, cgreen, cblue; + int cb, cr; + register JSAMPROW outptr; + JSAMPROW inptr0, inptr1, inptr2; + JDIMENSION col; + /* copy these pointers into registers if possible */ + register JSAMPLE * range_limit = cinfo->sample_range_limit; + int * Crrtab = upsample->Cr_r_tab; + int * Cbbtab = upsample->Cb_b_tab; + JLONG * Crgtab = upsample->Cr_g_tab; + JLONG * Cbgtab = upsample->Cb_g_tab; + JLONG d0 = dither_matrix[cinfo->output_scanline & DITHER_MASK]; + unsigned int r, g, b; + JLONG rgb; + SHIFT_TEMPS + + inptr0 = input_buf[0][in_row_group_ctr]; + inptr1 = input_buf[1][in_row_group_ctr]; + inptr2 = input_buf[2][in_row_group_ctr]; + outptr = output_buf[0]; + + /* Loop for each pair of output pixels */ + for (col = cinfo->output_width >> 1; col > 0; col--) { + /* Do the chroma part of the calculation */ + cb = GETJSAMPLE(*inptr1++); + cr = GETJSAMPLE(*inptr2++); + cred = Crrtab[cr]; + cgreen = (int) RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr], SCALEBITS); + cblue = Cbbtab[cb]; + + /* Fetch 2 Y values and emit 2 pixels */ + y = GETJSAMPLE(*inptr0++); + r = range_limit[DITHER_565_R(y + cred, d0)]; + g = range_limit[DITHER_565_G(y + cgreen, d0)]; + b = range_limit[DITHER_565_B(y + cblue, d0)]; + d0 = DITHER_ROTATE(d0); + rgb = PACK_SHORT_565(r, g, b); + + y = GETJSAMPLE(*inptr0++); + r = range_limit[DITHER_565_R(y + cred, d0)]; + g = range_limit[DITHER_565_G(y + cgreen, d0)]; + b = range_limit[DITHER_565_B(y + cblue, d0)]; + d0 = DITHER_ROTATE(d0); + rgb = PACK_TWO_PIXELS(rgb, PACK_SHORT_565(r, g, b)); + + WRITE_TWO_PIXELS(outptr, rgb); + outptr += 4; + } + + /* If image width is odd, do the last output column separately */ + if (cinfo->output_width & 1) { + cb = GETJSAMPLE(*inptr1); + cr = GETJSAMPLE(*inptr2); + cred = Crrtab[cr]; + cgreen = (int) RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr], SCALEBITS); + cblue = Cbbtab[cb]; + y = GETJSAMPLE(*inptr0); + r = range_limit[DITHER_565_R(y + cred, d0)]; + g = range_limit[DITHER_565_G(y + cgreen, d0)]; + b = range_limit[DITHER_565_B(y + cblue, d0)]; + rgb = PACK_SHORT_565(r, g, b); + *(INT16*)outptr = (INT16)rgb; + } +} + + +INLINE +LOCAL(void) +h2v2_merged_upsample_565_internal (j_decompress_ptr cinfo, + JSAMPIMAGE input_buf, + JDIMENSION in_row_group_ctr, + JSAMPARRAY output_buf) +{ + my_upsample_ptr upsample = (my_upsample_ptr) cinfo->upsample; + register int y, cred, cgreen, cblue; + int cb, cr; + register JSAMPROW outptr0, outptr1; + JSAMPROW inptr00, inptr01, inptr1, inptr2; + JDIMENSION col; + /* copy these pointers into registers if possible */ + register JSAMPLE * range_limit = cinfo->sample_range_limit; + int * Crrtab = upsample->Cr_r_tab; + int * Cbbtab = upsample->Cb_b_tab; + JLONG * Crgtab = upsample->Cr_g_tab; + JLONG * Cbgtab = upsample->Cb_g_tab; + unsigned int r, g, b; + JLONG rgb; + SHIFT_TEMPS + + inptr00 = input_buf[0][in_row_group_ctr * 2]; + inptr01 = input_buf[0][in_row_group_ctr * 2 + 1]; + inptr1 = input_buf[1][in_row_group_ctr]; + inptr2 = input_buf[2][in_row_group_ctr]; + outptr0 = output_buf[0]; + outptr1 = output_buf[1]; + + /* Loop for each group of output pixels */ + for (col = cinfo->output_width >> 1; col > 0; col--) { + /* Do the chroma part of the calculation */ + cb = GETJSAMPLE(*inptr1++); + cr = GETJSAMPLE(*inptr2++); + cred = Crrtab[cr]; + cgreen = (int) RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr], SCALEBITS); + cblue = Cbbtab[cb]; + + /* Fetch 4 Y values and emit 4 pixels */ + y = GETJSAMPLE(*inptr00++); + r = range_limit[y + cred]; + g = range_limit[y + cgreen]; + b = range_limit[y + cblue]; + rgb = PACK_SHORT_565(r, g, b); + + y = GETJSAMPLE(*inptr00++); + r = range_limit[y + cred]; + g = range_limit[y + cgreen]; + b = range_limit[y + cblue]; + rgb = PACK_TWO_PIXELS(rgb, PACK_SHORT_565(r, g, b)); + + WRITE_TWO_PIXELS(outptr0, rgb); + outptr0 += 4; + + y = GETJSAMPLE(*inptr01++); + r = range_limit[y + cred]; + g = range_limit[y + cgreen]; + b = range_limit[y + cblue]; + rgb = PACK_SHORT_565(r, g, b); + + y = GETJSAMPLE(*inptr01++); + r = range_limit[y + cred]; + g = range_limit[y + cgreen]; + b = range_limit[y + cblue]; + rgb = PACK_TWO_PIXELS(rgb, PACK_SHORT_565(r, g, b)); + + WRITE_TWO_PIXELS(outptr1, rgb); + outptr1 += 4; + } + + /* If image width is odd, do the last output column separately */ + if (cinfo->output_width & 1) { + cb = GETJSAMPLE(*inptr1); + cr = GETJSAMPLE(*inptr2); + cred = Crrtab[cr]; + cgreen = (int) RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr], SCALEBITS); + cblue = Cbbtab[cb]; + + y = GETJSAMPLE(*inptr00); + r = range_limit[y + cred]; + g = range_limit[y + cgreen]; + b = range_limit[y + cblue]; + rgb = PACK_SHORT_565(r, g, b); + *(INT16*)outptr0 = (INT16)rgb; + + y = GETJSAMPLE(*inptr01); + r = range_limit[y + cred]; + g = range_limit[y + cgreen]; + b = range_limit[y + cblue]; + rgb = PACK_SHORT_565(r, g, b); + *(INT16*)outptr1 = (INT16)rgb; + } +} + + +INLINE +LOCAL(void) +h2v2_merged_upsample_565D_internal (j_decompress_ptr cinfo, + JSAMPIMAGE input_buf, + JDIMENSION in_row_group_ctr, + JSAMPARRAY output_buf) +{ + my_upsample_ptr upsample = (my_upsample_ptr) cinfo->upsample; + register int y, cred, cgreen, cblue; + int cb, cr; + register JSAMPROW outptr0, outptr1; + JSAMPROW inptr00, inptr01, inptr1, inptr2; + JDIMENSION col; + /* copy these pointers into registers if possible */ + register JSAMPLE * range_limit = cinfo->sample_range_limit; + int * Crrtab = upsample->Cr_r_tab; + int * Cbbtab = upsample->Cb_b_tab; + JLONG * Crgtab = upsample->Cr_g_tab; + JLONG * Cbgtab = upsample->Cb_g_tab; + JLONG d0 = dither_matrix[cinfo->output_scanline & DITHER_MASK]; + JLONG d1 = dither_matrix[(cinfo->output_scanline+1) & DITHER_MASK]; + unsigned int r, g, b; + JLONG rgb; + SHIFT_TEMPS + + inptr00 = input_buf[0][in_row_group_ctr*2]; + inptr01 = input_buf[0][in_row_group_ctr*2 + 1]; + inptr1 = input_buf[1][in_row_group_ctr]; + inptr2 = input_buf[2][in_row_group_ctr]; + outptr0 = output_buf[0]; + outptr1 = output_buf[1]; + + /* Loop for each group of output pixels */ + for (col = cinfo->output_width >> 1; col > 0; col--) { + /* Do the chroma part of the calculation */ + cb = GETJSAMPLE(*inptr1++); + cr = GETJSAMPLE(*inptr2++); + cred = Crrtab[cr]; + cgreen = (int) RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr], SCALEBITS); + cblue = Cbbtab[cb]; + + /* Fetch 4 Y values and emit 4 pixels */ + y = GETJSAMPLE(*inptr00++); + r = range_limit[DITHER_565_R(y + cred, d0)]; + g = range_limit[DITHER_565_G(y + cgreen, d0)]; + b = range_limit[DITHER_565_B(y + cblue, d0)]; + d0 = DITHER_ROTATE(d0); + rgb = PACK_SHORT_565(r, g, b); + + y = GETJSAMPLE(*inptr00++); + r = range_limit[DITHER_565_R(y + cred, d1)]; + g = range_limit[DITHER_565_G(y + cgreen, d1)]; + b = range_limit[DITHER_565_B(y + cblue, d1)]; + d1 = DITHER_ROTATE(d1); + rgb = PACK_TWO_PIXELS(rgb, PACK_SHORT_565(r, g, b)); + + WRITE_TWO_PIXELS(outptr0, rgb); + outptr0 += 4; + + y = GETJSAMPLE(*inptr01++); + r = range_limit[DITHER_565_R(y + cred, d0)]; + g = range_limit[DITHER_565_G(y + cgreen, d0)]; + b = range_limit[DITHER_565_B(y + cblue, d0)]; + d0 = DITHER_ROTATE(d0); + rgb = PACK_SHORT_565(r, g, b); + + y = GETJSAMPLE(*inptr01++); + r = range_limit[DITHER_565_R(y + cred, d1)]; + g = range_limit[DITHER_565_G(y + cgreen, d1)]; + b = range_limit[DITHER_565_B(y + cblue, d1)]; + d1 = DITHER_ROTATE(d1); + rgb = PACK_TWO_PIXELS(rgb, PACK_SHORT_565(r, g, b)); + + WRITE_TWO_PIXELS(outptr1, rgb); + outptr1 += 4; + } + + /* If image width is odd, do the last output column separately */ + if (cinfo->output_width & 1) { + cb = GETJSAMPLE(*inptr1); + cr = GETJSAMPLE(*inptr2); + cred = Crrtab[cr]; + cgreen = (int) RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr], SCALEBITS); + cblue = Cbbtab[cb]; + + y = GETJSAMPLE(*inptr00); + r = range_limit[DITHER_565_R(y + cred, d0)]; + g = range_limit[DITHER_565_G(y + cgreen, d0)]; + b = range_limit[DITHER_565_B(y + cblue, d0)]; + rgb = PACK_SHORT_565(r, g, b); + *(INT16*)outptr0 = (INT16)rgb; + + y = GETJSAMPLE(*inptr01); + r = range_limit[DITHER_565_R(y + cred, d1)]; + g = range_limit[DITHER_565_G(y + cgreen, d1)]; + b = range_limit[DITHER_565_B(y + cblue, d1)]; + rgb = PACK_SHORT_565(r, g, b); + *(INT16*)outptr1 = (INT16)rgb; + } +} diff --git a/media/libjpeg/jdmrgext.c b/media/libjpeg/jdmrgext.c new file mode 100644 index 000000000..9d7d2af2e --- /dev/null +++ b/media/libjpeg/jdmrgext.c @@ -0,0 +1,186 @@ +/* + * jdmrgext.c + * + * This file was part of the Independent JPEG Group's software: + * Copyright (C) 1994-1996, Thomas G. Lane. + * libjpeg-turbo Modifications: + * Copyright (C) 2011, 2015, D. R. Commander. + * For conditions of distribution and use, see the accompanying README.ijg + * file. + * + * This file contains code for merged upsampling/color conversion. + */ + + +/* This file is included by jdmerge.c */ + + +/* + * Upsample and color convert for the case of 2:1 horizontal and 1:1 vertical. + */ + +INLINE +LOCAL(void) +h2v1_merged_upsample_internal (j_decompress_ptr cinfo, + JSAMPIMAGE input_buf, + JDIMENSION in_row_group_ctr, + JSAMPARRAY output_buf) +{ + my_upsample_ptr upsample = (my_upsample_ptr) cinfo->upsample; + register int y, cred, cgreen, cblue; + int cb, cr; + register JSAMPROW outptr; + JSAMPROW inptr0, inptr1, inptr2; + JDIMENSION col; + /* copy these pointers into registers if possible */ + register JSAMPLE * range_limit = cinfo->sample_range_limit; + int * Crrtab = upsample->Cr_r_tab; + int * Cbbtab = upsample->Cb_b_tab; + JLONG * Crgtab = upsample->Cr_g_tab; + JLONG * Cbgtab = upsample->Cb_g_tab; + SHIFT_TEMPS + + inptr0 = input_buf[0][in_row_group_ctr]; + inptr1 = input_buf[1][in_row_group_ctr]; + inptr2 = input_buf[2][in_row_group_ctr]; + outptr = output_buf[0]; + /* Loop for each pair of output pixels */ + for (col = cinfo->output_width >> 1; col > 0; col--) { + /* Do the chroma part of the calculation */ + cb = GETJSAMPLE(*inptr1++); + cr = GETJSAMPLE(*inptr2++); + cred = Crrtab[cr]; + cgreen = (int) RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr], SCALEBITS); + cblue = Cbbtab[cb]; + /* Fetch 2 Y values and emit 2 pixels */ + y = GETJSAMPLE(*inptr0++); + outptr[RGB_RED] = range_limit[y + cred]; + outptr[RGB_GREEN] = range_limit[y + cgreen]; + outptr[RGB_BLUE] = range_limit[y + cblue]; +#ifdef RGB_ALPHA + outptr[RGB_ALPHA] = 0xFF; +#endif + outptr += RGB_PIXELSIZE; + y = GETJSAMPLE(*inptr0++); + outptr[RGB_RED] = range_limit[y + cred]; + outptr[RGB_GREEN] = range_limit[y + cgreen]; + outptr[RGB_BLUE] = range_limit[y + cblue]; +#ifdef RGB_ALPHA + outptr[RGB_ALPHA] = 0xFF; +#endif + outptr += RGB_PIXELSIZE; + } + /* If image width is odd, do the last output column separately */ + if (cinfo->output_width & 1) { + cb = GETJSAMPLE(*inptr1); + cr = GETJSAMPLE(*inptr2); + cred = Crrtab[cr]; + cgreen = (int) RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr], SCALEBITS); + cblue = Cbbtab[cb]; + y = GETJSAMPLE(*inptr0); + outptr[RGB_RED] = range_limit[y + cred]; + outptr[RGB_GREEN] = range_limit[y + cgreen]; + outptr[RGB_BLUE] = range_limit[y + cblue]; +#ifdef RGB_ALPHA + outptr[RGB_ALPHA] = 0xFF; +#endif + } +} + + +/* + * Upsample and color convert for the case of 2:1 horizontal and 2:1 vertical. + */ + +INLINE +LOCAL(void) +h2v2_merged_upsample_internal (j_decompress_ptr cinfo, + JSAMPIMAGE input_buf, + JDIMENSION in_row_group_ctr, + JSAMPARRAY output_buf) +{ + my_upsample_ptr upsample = (my_upsample_ptr) cinfo->upsample; + register int y, cred, cgreen, cblue; + int cb, cr; + register JSAMPROW outptr0, outptr1; + JSAMPROW inptr00, inptr01, inptr1, inptr2; + JDIMENSION col; + /* copy these pointers into registers if possible */ + register JSAMPLE * range_limit = cinfo->sample_range_limit; + int * Crrtab = upsample->Cr_r_tab; + int * Cbbtab = upsample->Cb_b_tab; + JLONG * Crgtab = upsample->Cr_g_tab; + JLONG * Cbgtab = upsample->Cb_g_tab; + SHIFT_TEMPS + + inptr00 = input_buf[0][in_row_group_ctr*2]; + inptr01 = input_buf[0][in_row_group_ctr*2 + 1]; + inptr1 = input_buf[1][in_row_group_ctr]; + inptr2 = input_buf[2][in_row_group_ctr]; + outptr0 = output_buf[0]; + outptr1 = output_buf[1]; + /* Loop for each group of output pixels */ + for (col = cinfo->output_width >> 1; col > 0; col--) { + /* Do the chroma part of the calculation */ + cb = GETJSAMPLE(*inptr1++); + cr = GETJSAMPLE(*inptr2++); + cred = Crrtab[cr]; + cgreen = (int) RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr], SCALEBITS); + cblue = Cbbtab[cb]; + /* Fetch 4 Y values and emit 4 pixels */ + y = GETJSAMPLE(*inptr00++); + outptr0[RGB_RED] = range_limit[y + cred]; + outptr0[RGB_GREEN] = range_limit[y + cgreen]; + outptr0[RGB_BLUE] = range_limit[y + cblue]; +#ifdef RGB_ALPHA + outptr0[RGB_ALPHA] = 0xFF; +#endif + outptr0 += RGB_PIXELSIZE; + y = GETJSAMPLE(*inptr00++); + outptr0[RGB_RED] = range_limit[y + cred]; + outptr0[RGB_GREEN] = range_limit[y + cgreen]; + outptr0[RGB_BLUE] = range_limit[y + cblue]; +#ifdef RGB_ALPHA + outptr0[RGB_ALPHA] = 0xFF; +#endif + outptr0 += RGB_PIXELSIZE; + y = GETJSAMPLE(*inptr01++); + outptr1[RGB_RED] = range_limit[y + cred]; + outptr1[RGB_GREEN] = range_limit[y + cgreen]; + outptr1[RGB_BLUE] = range_limit[y + cblue]; +#ifdef RGB_ALPHA + outptr1[RGB_ALPHA] = 0xFF; +#endif + outptr1 += RGB_PIXELSIZE; + y = GETJSAMPLE(*inptr01++); + outptr1[RGB_RED] = range_limit[y + cred]; + outptr1[RGB_GREEN] = range_limit[y + cgreen]; + outptr1[RGB_BLUE] = range_limit[y + cblue]; +#ifdef RGB_ALPHA + outptr1[RGB_ALPHA] = 0xFF; +#endif + outptr1 += RGB_PIXELSIZE; + } + /* If image width is odd, do the last output column separately */ + if (cinfo->output_width & 1) { + cb = GETJSAMPLE(*inptr1); + cr = GETJSAMPLE(*inptr2); + cred = Crrtab[cr]; + cgreen = (int) RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr], SCALEBITS); + cblue = Cbbtab[cb]; + y = GETJSAMPLE(*inptr00); + outptr0[RGB_RED] = range_limit[y + cred]; + outptr0[RGB_GREEN] = range_limit[y + cgreen]; + outptr0[RGB_BLUE] = range_limit[y + cblue]; +#ifdef RGB_ALPHA + outptr0[RGB_ALPHA] = 0xFF; +#endif + y = GETJSAMPLE(*inptr01); + outptr1[RGB_RED] = range_limit[y + cred]; + outptr1[RGB_GREEN] = range_limit[y + cgreen]; + outptr1[RGB_BLUE] = range_limit[y + cblue]; +#ifdef RGB_ALPHA + outptr1[RGB_ALPHA] = 0xFF; +#endif + } +} diff --git a/media/libjpeg/jdphuff.c b/media/libjpeg/jdphuff.c new file mode 100644 index 000000000..c927ffa07 --- /dev/null +++ b/media/libjpeg/jdphuff.c @@ -0,0 +1,674 @@ +/* + * jdphuff.c + * + * This file was part of the Independent JPEG Group's software: + * Copyright (C) 1995-1997, Thomas G. Lane. + * libjpeg-turbo Modifications: + * Copyright (C) 2015-2016, D. R. Commander. + * For conditions of distribution and use, see the accompanying README.ijg + * file. + * + * This file contains Huffman entropy decoding routines for progressive JPEG. + * + * Much of the complexity here has to do with supporting input suspension. + * If the data source module demands suspension, we want to be able to back + * up to the start of the current MCU. To do this, we copy state variables + * into local working storage, and update them back to the permanent + * storage only upon successful completion of an MCU. + */ + +#define JPEG_INTERNALS +#include "jinclude.h" +#include "jpeglib.h" +#include "jdhuff.h" /* Declarations shared with jdhuff.c */ + + +#ifdef D_PROGRESSIVE_SUPPORTED + +/* + * Expanded entropy decoder object for progressive Huffman decoding. + * + * The savable_state subrecord contains fields that change within an MCU, + * but must not be updated permanently until we complete the MCU. + */ + +typedef struct { + unsigned int EOBRUN; /* remaining EOBs in EOBRUN */ + int last_dc_val[MAX_COMPS_IN_SCAN]; /* last DC coef for each component */ +} savable_state; + +/* This macro is to work around compilers with missing or broken + * structure assignment. You'll need to fix this code if you have + * such a compiler and you change MAX_COMPS_IN_SCAN. + */ + +#ifndef NO_STRUCT_ASSIGN +#define ASSIGN_STATE(dest,src) ((dest) = (src)) +#else +#if MAX_COMPS_IN_SCAN == 4 +#define ASSIGN_STATE(dest,src) \ + ((dest).EOBRUN = (src).EOBRUN, \ + (dest).last_dc_val[0] = (src).last_dc_val[0], \ + (dest).last_dc_val[1] = (src).last_dc_val[1], \ + (dest).last_dc_val[2] = (src).last_dc_val[2], \ + (dest).last_dc_val[3] = (src).last_dc_val[3]) +#endif +#endif + + +typedef struct { + struct jpeg_entropy_decoder pub; /* public fields */ + + /* These fields are loaded into local variables at start of each MCU. + * In case of suspension, we exit WITHOUT updating them. + */ + bitread_perm_state bitstate; /* Bit buffer at start of MCU */ + savable_state saved; /* Other state at start of MCU */ + + /* These fields are NOT loaded into local working state. */ + unsigned int restarts_to_go; /* MCUs left in this restart interval */ + + /* Pointers to derived tables (these workspaces have image lifespan) */ + d_derived_tbl *derived_tbls[NUM_HUFF_TBLS]; + + d_derived_tbl *ac_derived_tbl; /* active table during an AC scan */ +} phuff_entropy_decoder; + +typedef phuff_entropy_decoder *phuff_entropy_ptr; + +/* Forward declarations */ +METHODDEF(boolean) decode_mcu_DC_first (j_decompress_ptr cinfo, + JBLOCKROW *MCU_data); +METHODDEF(boolean) decode_mcu_AC_first (j_decompress_ptr cinfo, + JBLOCKROW *MCU_data); +METHODDEF(boolean) decode_mcu_DC_refine (j_decompress_ptr cinfo, + JBLOCKROW *MCU_data); +METHODDEF(boolean) decode_mcu_AC_refine (j_decompress_ptr cinfo, + JBLOCKROW *MCU_data); + + +/* + * Initialize for a Huffman-compressed scan. + */ + +METHODDEF(void) +start_pass_phuff_decoder (j_decompress_ptr cinfo) +{ + phuff_entropy_ptr entropy = (phuff_entropy_ptr) cinfo->entropy; + boolean is_DC_band, bad; + int ci, coefi, tbl; + d_derived_tbl **pdtbl; + int *coef_bit_ptr; + jpeg_component_info *compptr; + + is_DC_band = (cinfo->Ss == 0); + + /* Validate scan parameters */ + bad = FALSE; + if (is_DC_band) { + if (cinfo->Se != 0) + bad = TRUE; + } else { + /* need not check Ss/Se < 0 since they came from unsigned bytes */ + if (cinfo->Ss > cinfo->Se || cinfo->Se >= DCTSIZE2) + bad = TRUE; + /* AC scans may have only one component */ + if (cinfo->comps_in_scan != 1) + bad = TRUE; + } + if (cinfo->Ah != 0) { + /* Successive approximation refinement scan: must have Al = Ah-1. */ + if (cinfo->Al != cinfo->Ah-1) + bad = TRUE; + } + if (cinfo->Al > 13) /* need not check for < 0 */ + bad = TRUE; + /* Arguably the maximum Al value should be less than 13 for 8-bit precision, + * but the spec doesn't say so, and we try to be liberal about what we + * accept. Note: large Al values could result in out-of-range DC + * coefficients during early scans, leading to bizarre displays due to + * overflows in the IDCT math. But we won't crash. + */ + if (bad) + ERREXIT4(cinfo, JERR_BAD_PROGRESSION, + cinfo->Ss, cinfo->Se, cinfo->Ah, cinfo->Al); + /* Update progression status, and verify that scan order is legal. + * Note that inter-scan inconsistencies are treated as warnings + * not fatal errors ... not clear if this is right way to behave. + */ + for (ci = 0; ci < cinfo->comps_in_scan; ci++) { + int cindex = cinfo->cur_comp_info[ci]->component_index; + coef_bit_ptr = & cinfo->coef_bits[cindex][0]; + if (!is_DC_band && coef_bit_ptr[0] < 0) /* AC without prior DC scan */ + WARNMS2(cinfo, JWRN_BOGUS_PROGRESSION, cindex, 0); + for (coefi = cinfo->Ss; coefi <= cinfo->Se; coefi++) { + int expected = (coef_bit_ptr[coefi] < 0) ? 0 : coef_bit_ptr[coefi]; + if (cinfo->Ah != expected) + WARNMS2(cinfo, JWRN_BOGUS_PROGRESSION, cindex, coefi); + coef_bit_ptr[coefi] = cinfo->Al; + } + } + + /* Select MCU decoding routine */ + if (cinfo->Ah == 0) { + if (is_DC_band) + entropy->pub.decode_mcu = decode_mcu_DC_first; + else + entropy->pub.decode_mcu = decode_mcu_AC_first; + } else { + if (is_DC_band) + entropy->pub.decode_mcu = decode_mcu_DC_refine; + else + entropy->pub.decode_mcu = decode_mcu_AC_refine; + } + + for (ci = 0; ci < cinfo->comps_in_scan; ci++) { + compptr = cinfo->cur_comp_info[ci]; + /* Make sure requested tables are present, and compute derived tables. + * We may build same derived table more than once, but it's not expensive. + */ + if (is_DC_band) { + if (cinfo->Ah == 0) { /* DC refinement needs no table */ + tbl = compptr->dc_tbl_no; + pdtbl = (d_derived_tbl **)(entropy->derived_tbls) + tbl; + jpeg_make_d_derived_tbl(cinfo, TRUE, tbl, pdtbl); + } + } else { + tbl = compptr->ac_tbl_no; + pdtbl = (d_derived_tbl **)(entropy->derived_tbls) + tbl; + jpeg_make_d_derived_tbl(cinfo, FALSE, tbl, pdtbl); + /* remember the single active table */ + entropy->ac_derived_tbl = entropy->derived_tbls[tbl]; + } + /* Initialize DC predictions to 0 */ + entropy->saved.last_dc_val[ci] = 0; + } + + /* Initialize bitread state variables */ + entropy->bitstate.bits_left = 0; + entropy->bitstate.get_buffer = 0; /* unnecessary, but keeps Purify quiet */ + entropy->pub.insufficient_data = FALSE; + + /* Initialize private state variables */ + entropy->saved.EOBRUN = 0; + + /* Initialize restart counter */ + entropy->restarts_to_go = cinfo->restart_interval; +} + + +/* + * Figure F.12: extend sign bit. + * On some machines, a shift and add will be faster than a table lookup. + */ + +#define AVOID_TABLES +#ifdef AVOID_TABLES + +#define NEG_1 ((unsigned)-1) +#define HUFF_EXTEND(x,s) ((x) < (1<<((s)-1)) ? (x) + (((NEG_1)<<(s)) + 1) : (x)) + +#else + +#define HUFF_EXTEND(x,s) ((x) < extend_test[s] ? (x) + extend_offset[s] : (x)) + +static const int extend_test[16] = /* entry n is 2**(n-1) */ + { 0, 0x0001, 0x0002, 0x0004, 0x0008, 0x0010, 0x0020, 0x0040, 0x0080, + 0x0100, 0x0200, 0x0400, 0x0800, 0x1000, 0x2000, 0x4000 }; + +static const int extend_offset[16] = /* entry n is (-1 << n) + 1 */ + { 0, ((-1)<<1) + 1, ((-1)<<2) + 1, ((-1)<<3) + 1, ((-1)<<4) + 1, + ((-1)<<5) + 1, ((-1)<<6) + 1, ((-1)<<7) + 1, ((-1)<<8) + 1, + ((-1)<<9) + 1, ((-1)<<10) + 1, ((-1)<<11) + 1, ((-1)<<12) + 1, + ((-1)<<13) + 1, ((-1)<<14) + 1, ((-1)<<15) + 1 }; + +#endif /* AVOID_TABLES */ + + +/* + * Check for a restart marker & resynchronize decoder. + * Returns FALSE if must suspend. + */ + +LOCAL(boolean) +process_restart (j_decompress_ptr cinfo) +{ + phuff_entropy_ptr entropy = (phuff_entropy_ptr) cinfo->entropy; + int ci; + + /* Throw away any unused bits remaining in bit buffer; */ + /* include any full bytes in next_marker's count of discarded bytes */ + cinfo->marker->discarded_bytes += entropy->bitstate.bits_left / 8; + entropy->bitstate.bits_left = 0; + + /* Advance past the RSTn marker */ + if (! (*cinfo->marker->read_restart_marker) (cinfo)) + return FALSE; + + /* Re-initialize DC predictions to 0 */ + for (ci = 0; ci < cinfo->comps_in_scan; ci++) + entropy->saved.last_dc_val[ci] = 0; + /* Re-init EOB run count, too */ + entropy->saved.EOBRUN = 0; + + /* Reset restart counter */ + entropy->restarts_to_go = cinfo->restart_interval; + + /* Reset out-of-data flag, unless read_restart_marker left us smack up + * against a marker. In that case we will end up treating the next data + * segment as empty, and we can avoid producing bogus output pixels by + * leaving the flag set. + */ + if (cinfo->unread_marker == 0) + entropy->pub.insufficient_data = FALSE; + + return TRUE; +} + + +/* + * Huffman MCU decoding. + * Each of these routines decodes and returns one MCU's worth of + * Huffman-compressed coefficients. + * The coefficients are reordered from zigzag order into natural array order, + * but are not dequantized. + * + * The i'th block of the MCU is stored into the block pointed to by + * MCU_data[i]. WE ASSUME THIS AREA IS INITIALLY ZEROED BY THE CALLER. + * + * We return FALSE if data source requested suspension. In that case no + * changes have been made to permanent state. (Exception: some output + * coefficients may already have been assigned. This is harmless for + * spectral selection, since we'll just re-assign them on the next call. + * Successive approximation AC refinement has to be more careful, however.) + */ + +/* + * MCU decoding for DC initial scan (either spectral selection, + * or first pass of successive approximation). + */ + +METHODDEF(boolean) +decode_mcu_DC_first (j_decompress_ptr cinfo, JBLOCKROW *MCU_data) +{ + phuff_entropy_ptr entropy = (phuff_entropy_ptr) cinfo->entropy; + int Al = cinfo->Al; + register int s, r; + int blkn, ci; + JBLOCKROW block; + BITREAD_STATE_VARS; + savable_state state; + d_derived_tbl *tbl; + jpeg_component_info *compptr; + + /* Process restart marker if needed; may have to suspend */ + if (cinfo->restart_interval) { + if (entropy->restarts_to_go == 0) + if (! process_restart(cinfo)) + return FALSE; + } + + /* If we've run out of data, just leave the MCU set to zeroes. + * This way, we return uniform gray for the remainder of the segment. + */ + if (! entropy->pub.insufficient_data) { + + /* Load up working state */ + BITREAD_LOAD_STATE(cinfo,entropy->bitstate); + ASSIGN_STATE(state, entropy->saved); + + /* Outer loop handles each block in the MCU */ + + for (blkn = 0; blkn < cinfo->blocks_in_MCU; blkn++) { + block = MCU_data[blkn]; + ci = cinfo->MCU_membership[blkn]; + compptr = cinfo->cur_comp_info[ci]; + tbl = entropy->derived_tbls[compptr->dc_tbl_no]; + + /* Decode a single block's worth of coefficients */ + + /* Section F.2.2.1: decode the DC coefficient difference */ + HUFF_DECODE(s, br_state, tbl, return FALSE, label1); + if (s) { + CHECK_BIT_BUFFER(br_state, s, return FALSE); + r = GET_BITS(s); + s = HUFF_EXTEND(r, s); + } + + /* Convert DC difference to actual value, update last_dc_val */ + s += state.last_dc_val[ci]; + state.last_dc_val[ci] = s; + /* Scale and output the coefficient (assumes jpeg_natural_order[0]=0) */ + (*block)[0] = (JCOEF) LEFT_SHIFT(s, Al); + } + + /* Completed MCU, so update state */ + BITREAD_SAVE_STATE(cinfo,entropy->bitstate); + ASSIGN_STATE(entropy->saved, state); + } + + /* Account for restart interval (no-op if not using restarts) */ + entropy->restarts_to_go--; + + return TRUE; +} + + +/* + * MCU decoding for AC initial scan (either spectral selection, + * or first pass of successive approximation). + */ + +METHODDEF(boolean) +decode_mcu_AC_first (j_decompress_ptr cinfo, JBLOCKROW *MCU_data) +{ + phuff_entropy_ptr entropy = (phuff_entropy_ptr) cinfo->entropy; + int Se = cinfo->Se; + int Al = cinfo->Al; + register int s, k, r; + unsigned int EOBRUN; + JBLOCKROW block; + BITREAD_STATE_VARS; + d_derived_tbl *tbl; + + /* Process restart marker if needed; may have to suspend */ + if (cinfo->restart_interval) { + if (entropy->restarts_to_go == 0) + if (! process_restart(cinfo)) + return FALSE; + } + + /* If we've run out of data, just leave the MCU set to zeroes. + * This way, we return uniform gray for the remainder of the segment. + */ + if (! entropy->pub.insufficient_data) { + + /* Load up working state. + * We can avoid loading/saving bitread state if in an EOB run. + */ + EOBRUN = entropy->saved.EOBRUN; /* only part of saved state we need */ + + /* There is always only one block per MCU */ + + if (EOBRUN > 0) /* if it's a band of zeroes... */ + EOBRUN--; /* ...process it now (we do nothing) */ + else { + BITREAD_LOAD_STATE(cinfo,entropy->bitstate); + block = MCU_data[0]; + tbl = entropy->ac_derived_tbl; + + for (k = cinfo->Ss; k <= Se; k++) { + HUFF_DECODE(s, br_state, tbl, return FALSE, label2); + r = s >> 4; + s &= 15; + if (s) { + k += r; + CHECK_BIT_BUFFER(br_state, s, return FALSE); + r = GET_BITS(s); + s = HUFF_EXTEND(r, s); + /* Scale and output coefficient in natural (dezigzagged) order */ + (*block)[jpeg_natural_order[k]] = (JCOEF) LEFT_SHIFT(s, Al); + } else { + if (r == 15) { /* ZRL */ + k += 15; /* skip 15 zeroes in band */ + } else { /* EOBr, run length is 2^r + appended bits */ + EOBRUN = 1 << r; + if (r) { /* EOBr, r > 0 */ + CHECK_BIT_BUFFER(br_state, r, return FALSE); + r = GET_BITS(r); + EOBRUN += r; + } + EOBRUN--; /* this band is processed at this moment */ + break; /* force end-of-band */ + } + } + } + + BITREAD_SAVE_STATE(cinfo,entropy->bitstate); + } + + /* Completed MCU, so update state */ + entropy->saved.EOBRUN = EOBRUN; /* only part of saved state we need */ + } + + /* Account for restart interval (no-op if not using restarts) */ + entropy->restarts_to_go--; + + return TRUE; +} + + +/* + * MCU decoding for DC successive approximation refinement scan. + * Note: we assume such scans can be multi-component, although the spec + * is not very clear on the point. + */ + +METHODDEF(boolean) +decode_mcu_DC_refine (j_decompress_ptr cinfo, JBLOCKROW *MCU_data) +{ + phuff_entropy_ptr entropy = (phuff_entropy_ptr) cinfo->entropy; + int p1 = 1 << cinfo->Al; /* 1 in the bit position being coded */ + int blkn; + JBLOCKROW block; + BITREAD_STATE_VARS; + + /* Process restart marker if needed; may have to suspend */ + if (cinfo->restart_interval) { + if (entropy->restarts_to_go == 0) + if (! process_restart(cinfo)) + return FALSE; + } + + /* Not worth the cycles to check insufficient_data here, + * since we will not change the data anyway if we read zeroes. + */ + + /* Load up working state */ + BITREAD_LOAD_STATE(cinfo,entropy->bitstate); + + /* Outer loop handles each block in the MCU */ + + for (blkn = 0; blkn < cinfo->blocks_in_MCU; blkn++) { + block = MCU_data[blkn]; + + /* Encoded data is simply the next bit of the two's-complement DC value */ + CHECK_BIT_BUFFER(br_state, 1, return FALSE); + if (GET_BITS(1)) + (*block)[0] |= p1; + /* Note: since we use |=, repeating the assignment later is safe */ + } + + /* Completed MCU, so update state */ + BITREAD_SAVE_STATE(cinfo,entropy->bitstate); + + /* Account for restart interval (no-op if not using restarts) */ + entropy->restarts_to_go--; + + return TRUE; +} + + +/* + * MCU decoding for AC successive approximation refinement scan. + */ + +METHODDEF(boolean) +decode_mcu_AC_refine (j_decompress_ptr cinfo, JBLOCKROW *MCU_data) +{ + phuff_entropy_ptr entropy = (phuff_entropy_ptr) cinfo->entropy; + int Se = cinfo->Se; + int p1 = 1 << cinfo->Al; /* 1 in the bit position being coded */ + int m1 = (NEG_1) << cinfo->Al; /* -1 in the bit position being coded */ + register int s, k, r; + unsigned int EOBRUN; + JBLOCKROW block; + JCOEFPTR thiscoef; + BITREAD_STATE_VARS; + d_derived_tbl *tbl; + int num_newnz; + int newnz_pos[DCTSIZE2]; + + /* Process restart marker if needed; may have to suspend */ + if (cinfo->restart_interval) { + if (entropy->restarts_to_go == 0) + if (! process_restart(cinfo)) + return FALSE; + } + + /* If we've run out of data, don't modify the MCU. + */ + if (! entropy->pub.insufficient_data) { + + /* Load up working state */ + BITREAD_LOAD_STATE(cinfo,entropy->bitstate); + EOBRUN = entropy->saved.EOBRUN; /* only part of saved state we need */ + + /* There is always only one block per MCU */ + block = MCU_data[0]; + tbl = entropy->ac_derived_tbl; + + /* If we are forced to suspend, we must undo the assignments to any newly + * nonzero coefficients in the block, because otherwise we'd get confused + * next time about which coefficients were already nonzero. + * But we need not undo addition of bits to already-nonzero coefficients; + * instead, we can test the current bit to see if we already did it. + */ + num_newnz = 0; + + /* initialize coefficient loop counter to start of band */ + k = cinfo->Ss; + + if (EOBRUN == 0) { + for (; k <= Se; k++) { + HUFF_DECODE(s, br_state, tbl, goto undoit, label3); + r = s >> 4; + s &= 15; + if (s) { + if (s != 1) /* size of new coef should always be 1 */ + WARNMS(cinfo, JWRN_HUFF_BAD_CODE); + CHECK_BIT_BUFFER(br_state, 1, goto undoit); + if (GET_BITS(1)) + s = p1; /* newly nonzero coef is positive */ + else + s = m1; /* newly nonzero coef is negative */ + } else { + if (r != 15) { + EOBRUN = 1 << r; /* EOBr, run length is 2^r + appended bits */ + if (r) { + CHECK_BIT_BUFFER(br_state, r, goto undoit); + r = GET_BITS(r); + EOBRUN += r; + } + break; /* rest of block is handled by EOB logic */ + } + /* note s = 0 for processing ZRL */ + } + /* Advance over already-nonzero coefs and r still-zero coefs, + * appending correction bits to the nonzeroes. A correction bit is 1 + * if the absolute value of the coefficient must be increased. + */ + do { + thiscoef = *block + jpeg_natural_order[k]; + if (*thiscoef != 0) { + CHECK_BIT_BUFFER(br_state, 1, goto undoit); + if (GET_BITS(1)) { + if ((*thiscoef & p1) == 0) { /* do nothing if already set it */ + if (*thiscoef >= 0) + *thiscoef += p1; + else + *thiscoef += m1; + } + } + } else { + if (--r < 0) + break; /* reached target zero coefficient */ + } + k++; + } while (k <= Se); + if (s) { + int pos = jpeg_natural_order[k]; + /* Output newly nonzero coefficient */ + (*block)[pos] = (JCOEF) s; + /* Remember its position in case we have to suspend */ + newnz_pos[num_newnz++] = pos; + } + } + } + + if (EOBRUN > 0) { + /* Scan any remaining coefficient positions after the end-of-band + * (the last newly nonzero coefficient, if any). Append a correction + * bit to each already-nonzero coefficient. A correction bit is 1 + * if the absolute value of the coefficient must be increased. + */ + for (; k <= Se; k++) { + thiscoef = *block + jpeg_natural_order[k]; + if (*thiscoef != 0) { + CHECK_BIT_BUFFER(br_state, 1, goto undoit); + if (GET_BITS(1)) { + if ((*thiscoef & p1) == 0) { /* do nothing if already changed it */ + if (*thiscoef >= 0) + *thiscoef += p1; + else + *thiscoef += m1; + } + } + } + } + /* Count one block completed in EOB run */ + EOBRUN--; + } + + /* Completed MCU, so update state */ + BITREAD_SAVE_STATE(cinfo,entropy->bitstate); + entropy->saved.EOBRUN = EOBRUN; /* only part of saved state we need */ + } + + /* Account for restart interval (no-op if not using restarts) */ + entropy->restarts_to_go--; + + return TRUE; + +undoit: + /* Re-zero any output coefficients that we made newly nonzero */ + while (num_newnz > 0) + (*block)[newnz_pos[--num_newnz]] = 0; + + return FALSE; +} + + +/* + * Module initialization routine for progressive Huffman entropy decoding. + */ + +GLOBAL(void) +jinit_phuff_decoder (j_decompress_ptr cinfo) +{ + phuff_entropy_ptr entropy; + int *coef_bit_ptr; + int ci, i; + + entropy = (phuff_entropy_ptr) + (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE, + sizeof(phuff_entropy_decoder)); + cinfo->entropy = (struct jpeg_entropy_decoder *) entropy; + entropy->pub.start_pass = start_pass_phuff_decoder; + + /* Mark derived tables unallocated */ + for (i = 0; i < NUM_HUFF_TBLS; i++) { + entropy->derived_tbls[i] = NULL; + } + + /* Create progression status table */ + cinfo->coef_bits = (int (*)[DCTSIZE2]) + (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE, + cinfo->num_components*DCTSIZE2*sizeof(int)); + coef_bit_ptr = & cinfo->coef_bits[0][0]; + for (ci = 0; ci < cinfo->num_components; ci++) + for (i = 0; i < DCTSIZE2; i++) + *coef_bit_ptr++ = -1; +} + +#endif /* D_PROGRESSIVE_SUPPORTED */ diff --git a/media/libjpeg/jdpostct.c b/media/libjpeg/jdpostct.c new file mode 100644 index 000000000..601fc2a79 --- /dev/null +++ b/media/libjpeg/jdpostct.c @@ -0,0 +1,290 @@ +/* + * jdpostct.c + * + * This file was part of the Independent JPEG Group's software: + * Copyright (C) 1994-1996, Thomas G. Lane. + * It was modified by The libjpeg-turbo Project to include only code relevant + * to libjpeg-turbo. + * For conditions of distribution and use, see the accompanying README.ijg + * file. + * + * This file contains the decompression postprocessing controller. + * This controller manages the upsampling, color conversion, and color + * quantization/reduction steps; specifically, it controls the buffering + * between upsample/color conversion and color quantization/reduction. + * + * If no color quantization/reduction is required, then this module has no + * work to do, and it just hands off to the upsample/color conversion code. + * An integrated upsample/convert/quantize process would replace this module + * entirely. + */ + +#define JPEG_INTERNALS +#include "jinclude.h" +#include "jpeglib.h" + + +/* Private buffer controller object */ + +typedef struct { + struct jpeg_d_post_controller pub; /* public fields */ + + /* Color quantization source buffer: this holds output data from + * the upsample/color conversion step to be passed to the quantizer. + * For two-pass color quantization, we need a full-image buffer; + * for one-pass operation, a strip buffer is sufficient. + */ + jvirt_sarray_ptr whole_image; /* virtual array, or NULL if one-pass */ + JSAMPARRAY buffer; /* strip buffer, or current strip of virtual */ + JDIMENSION strip_height; /* buffer size in rows */ + /* for two-pass mode only: */ + JDIMENSION starting_row; /* row # of first row in current strip */ + JDIMENSION next_row; /* index of next row to fill/empty in strip */ +} my_post_controller; + +typedef my_post_controller *my_post_ptr; + + +/* Forward declarations */ +METHODDEF(void) post_process_1pass + (j_decompress_ptr cinfo, JSAMPIMAGE input_buf, + JDIMENSION *in_row_group_ctr, JDIMENSION in_row_groups_avail, + JSAMPARRAY output_buf, JDIMENSION *out_row_ctr, + JDIMENSION out_rows_avail); +#ifdef QUANT_2PASS_SUPPORTED +METHODDEF(void) post_process_prepass + (j_decompress_ptr cinfo, JSAMPIMAGE input_buf, + JDIMENSION *in_row_group_ctr, JDIMENSION in_row_groups_avail, + JSAMPARRAY output_buf, JDIMENSION *out_row_ctr, + JDIMENSION out_rows_avail); +METHODDEF(void) post_process_2pass + (j_decompress_ptr cinfo, JSAMPIMAGE input_buf, + JDIMENSION *in_row_group_ctr, JDIMENSION in_row_groups_avail, + JSAMPARRAY output_buf, JDIMENSION *out_row_ctr, + JDIMENSION out_rows_avail); +#endif + + +/* + * Initialize for a processing pass. + */ + +METHODDEF(void) +start_pass_dpost (j_decompress_ptr cinfo, J_BUF_MODE pass_mode) +{ + my_post_ptr post = (my_post_ptr) cinfo->post; + + switch (pass_mode) { + case JBUF_PASS_THRU: + if (cinfo->quantize_colors) { + /* Single-pass processing with color quantization. */ + post->pub.post_process_data = post_process_1pass; + /* We could be doing buffered-image output before starting a 2-pass + * color quantization; in that case, jinit_d_post_controller did not + * allocate a strip buffer. Use the virtual-array buffer as workspace. + */ + if (post->buffer == NULL) { + post->buffer = (*cinfo->mem->access_virt_sarray) + ((j_common_ptr) cinfo, post->whole_image, + (JDIMENSION) 0, post->strip_height, TRUE); + } + } else { + /* For single-pass processing without color quantization, + * I have no work to do; just call the upsampler directly. + */ + post->pub.post_process_data = cinfo->upsample->upsample; + } + break; +#ifdef QUANT_2PASS_SUPPORTED + case JBUF_SAVE_AND_PASS: + /* First pass of 2-pass quantization */ + if (post->whole_image == NULL) + ERREXIT(cinfo, JERR_BAD_BUFFER_MODE); + post->pub.post_process_data = post_process_prepass; + break; + case JBUF_CRANK_DEST: + /* Second pass of 2-pass quantization */ + if (post->whole_image == NULL) + ERREXIT(cinfo, JERR_BAD_BUFFER_MODE); + post->pub.post_process_data = post_process_2pass; + break; +#endif /* QUANT_2PASS_SUPPORTED */ + default: + ERREXIT(cinfo, JERR_BAD_BUFFER_MODE); + break; + } + post->starting_row = post->next_row = 0; +} + + +/* + * Process some data in the one-pass (strip buffer) case. + * This is used for color precision reduction as well as one-pass quantization. + */ + +METHODDEF(void) +post_process_1pass (j_decompress_ptr cinfo, + JSAMPIMAGE input_buf, JDIMENSION *in_row_group_ctr, + JDIMENSION in_row_groups_avail, + JSAMPARRAY output_buf, JDIMENSION *out_row_ctr, + JDIMENSION out_rows_avail) +{ + my_post_ptr post = (my_post_ptr) cinfo->post; + JDIMENSION num_rows, max_rows; + + /* Fill the buffer, but not more than what we can dump out in one go. */ + /* Note we rely on the upsampler to detect bottom of image. */ + max_rows = out_rows_avail - *out_row_ctr; + if (max_rows > post->strip_height) + max_rows = post->strip_height; + num_rows = 0; + (*cinfo->upsample->upsample) (cinfo, + input_buf, in_row_group_ctr, in_row_groups_avail, + post->buffer, &num_rows, max_rows); + /* Quantize and emit data. */ + (*cinfo->cquantize->color_quantize) (cinfo, + post->buffer, output_buf + *out_row_ctr, (int) num_rows); + *out_row_ctr += num_rows; +} + + +#ifdef QUANT_2PASS_SUPPORTED + +/* + * Process some data in the first pass of 2-pass quantization. + */ + +METHODDEF(void) +post_process_prepass (j_decompress_ptr cinfo, + JSAMPIMAGE input_buf, JDIMENSION *in_row_group_ctr, + JDIMENSION in_row_groups_avail, + JSAMPARRAY output_buf, JDIMENSION *out_row_ctr, + JDIMENSION out_rows_avail) +{ + my_post_ptr post = (my_post_ptr) cinfo->post; + JDIMENSION old_next_row, num_rows; + + /* Reposition virtual buffer if at start of strip. */ + if (post->next_row == 0) { + post->buffer = (*cinfo->mem->access_virt_sarray) + ((j_common_ptr) cinfo, post->whole_image, + post->starting_row, post->strip_height, TRUE); + } + + /* Upsample some data (up to a strip height's worth). */ + old_next_row = post->next_row; + (*cinfo->upsample->upsample) (cinfo, + input_buf, in_row_group_ctr, in_row_groups_avail, + post->buffer, &post->next_row, post->strip_height); + + /* Allow quantizer to scan new data. No data is emitted, */ + /* but we advance out_row_ctr so outer loop can tell when we're done. */ + if (post->next_row > old_next_row) { + num_rows = post->next_row - old_next_row; + (*cinfo->cquantize->color_quantize) (cinfo, post->buffer + old_next_row, + (JSAMPARRAY) NULL, (int) num_rows); + *out_row_ctr += num_rows; + } + + /* Advance if we filled the strip. */ + if (post->next_row >= post->strip_height) { + post->starting_row += post->strip_height; + post->next_row = 0; + } +} + + +/* + * Process some data in the second pass of 2-pass quantization. + */ + +METHODDEF(void) +post_process_2pass (j_decompress_ptr cinfo, + JSAMPIMAGE input_buf, JDIMENSION *in_row_group_ctr, + JDIMENSION in_row_groups_avail, + JSAMPARRAY output_buf, JDIMENSION *out_row_ctr, + JDIMENSION out_rows_avail) +{ + my_post_ptr post = (my_post_ptr) cinfo->post; + JDIMENSION num_rows, max_rows; + + /* Reposition virtual buffer if at start of strip. */ + if (post->next_row == 0) { + post->buffer = (*cinfo->mem->access_virt_sarray) + ((j_common_ptr) cinfo, post->whole_image, + post->starting_row, post->strip_height, FALSE); + } + + /* Determine number of rows to emit. */ + num_rows = post->strip_height - post->next_row; /* available in strip */ + max_rows = out_rows_avail - *out_row_ctr; /* available in output area */ + if (num_rows > max_rows) + num_rows = max_rows; + /* We have to check bottom of image here, can't depend on upsampler. */ + max_rows = cinfo->output_height - post->starting_row; + if (num_rows > max_rows) + num_rows = max_rows; + + /* Quantize and emit data. */ + (*cinfo->cquantize->color_quantize) (cinfo, + post->buffer + post->next_row, output_buf + *out_row_ctr, + (int) num_rows); + *out_row_ctr += num_rows; + + /* Advance if we filled the strip. */ + post->next_row += num_rows; + if (post->next_row >= post->strip_height) { + post->starting_row += post->strip_height; + post->next_row = 0; + } +} + +#endif /* QUANT_2PASS_SUPPORTED */ + + +/* + * Initialize postprocessing controller. + */ + +GLOBAL(void) +jinit_d_post_controller (j_decompress_ptr cinfo, boolean need_full_buffer) +{ + my_post_ptr post; + + post = (my_post_ptr) + (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE, + sizeof(my_post_controller)); + cinfo->post = (struct jpeg_d_post_controller *) post; + post->pub.start_pass = start_pass_dpost; + post->whole_image = NULL; /* flag for no virtual arrays */ + post->buffer = NULL; /* flag for no strip buffer */ + + /* Create the quantization buffer, if needed */ + if (cinfo->quantize_colors) { + /* The buffer strip height is max_v_samp_factor, which is typically + * an efficient number of rows for upsampling to return. + * (In the presence of output rescaling, we might want to be smarter?) + */ + post->strip_height = (JDIMENSION) cinfo->max_v_samp_factor; + if (need_full_buffer) { + /* Two-pass color quantization: need full-image storage. */ + /* We round up the number of rows to a multiple of the strip height. */ +#ifdef QUANT_2PASS_SUPPORTED + post->whole_image = (*cinfo->mem->request_virt_sarray) + ((j_common_ptr) cinfo, JPOOL_IMAGE, FALSE, + cinfo->output_width * cinfo->out_color_components, + (JDIMENSION) jround_up((long) cinfo->output_height, + (long) post->strip_height), + post->strip_height); +#else + ERREXIT(cinfo, JERR_BAD_BUFFER_MODE); +#endif /* QUANT_2PASS_SUPPORTED */ + } else { + /* One-pass color quantization: just make a strip buffer. */ + post->buffer = (*cinfo->mem->alloc_sarray) + ((j_common_ptr) cinfo, JPOOL_IMAGE, + cinfo->output_width * cinfo->out_color_components, + post->strip_height); + } + } +} diff --git a/media/libjpeg/jdsample.c b/media/libjpeg/jdsample.c new file mode 100644 index 000000000..b1378e151 --- /dev/null +++ b/media/libjpeg/jdsample.c @@ -0,0 +1,517 @@ +/* + * jdsample.c + * + * This file was part of the Independent JPEG Group's software: + * Copyright (C) 1991-1996, Thomas G. Lane. + * libjpeg-turbo Modifications: + * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB + * Copyright (C) 2010, 2015-2016, D. R. Commander. + * Copyright (C) 2014, MIPS Technologies, Inc., California. + * Copyright (C) 2015, Google, Inc. + * For conditions of distribution and use, see the accompanying README.ijg + * file. + * + * This file contains upsampling routines. + * + * Upsampling input data is counted in "row groups". A row group + * is defined to be (v_samp_factor * DCT_scaled_size / min_DCT_scaled_size) + * sample rows of each component. Upsampling will normally produce + * max_v_samp_factor pixel rows from each row group (but this could vary + * if the upsampler is applying a scale factor of its own). + * + * An excellent reference for image resampling is + * Digital Image Warping, George Wolberg, 1990. + * Pub. by IEEE Computer Society Press, Los Alamitos, CA. ISBN 0-8186-8944-7. + */ + +#include "jinclude.h" +#include "jdsample.h" +#include "jsimd.h" +#include "jpegcomp.h" + + + +/* + * Initialize for an upsampling pass. + */ + +METHODDEF(void) +start_pass_upsample (j_decompress_ptr cinfo) +{ + my_upsample_ptr upsample = (my_upsample_ptr) cinfo->upsample; + + /* Mark the conversion buffer empty */ + upsample->next_row_out = cinfo->max_v_samp_factor; + /* Initialize total-height counter for detecting bottom of image */ + upsample->rows_to_go = cinfo->output_height; +} + + +/* + * Control routine to do upsampling (and color conversion). + * + * In this version we upsample each component independently. + * We upsample one row group into the conversion buffer, then apply + * color conversion a row at a time. + */ + +METHODDEF(void) +sep_upsample (j_decompress_ptr cinfo, + JSAMPIMAGE input_buf, JDIMENSION *in_row_group_ctr, + JDIMENSION in_row_groups_avail, + JSAMPARRAY output_buf, JDIMENSION *out_row_ctr, + JDIMENSION out_rows_avail) +{ + my_upsample_ptr upsample = (my_upsample_ptr) cinfo->upsample; + int ci; + jpeg_component_info *compptr; + JDIMENSION num_rows; + + /* Fill the conversion buffer, if it's empty */ + if (upsample->next_row_out >= cinfo->max_v_samp_factor) { + for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components; + ci++, compptr++) { + /* Invoke per-component upsample method. Notice we pass a POINTER + * to color_buf[ci], so that fullsize_upsample can change it. + */ + (*upsample->methods[ci]) (cinfo, compptr, + input_buf[ci] + (*in_row_group_ctr * upsample->rowgroup_height[ci]), + upsample->color_buf + ci); + } + upsample->next_row_out = 0; + } + + /* Color-convert and emit rows */ + + /* How many we have in the buffer: */ + num_rows = (JDIMENSION) (cinfo->max_v_samp_factor - upsample->next_row_out); + /* Not more than the distance to the end of the image. Need this test + * in case the image height is not a multiple of max_v_samp_factor: + */ + if (num_rows > upsample->rows_to_go) + num_rows = upsample->rows_to_go; + /* And not more than what the client can accept: */ + out_rows_avail -= *out_row_ctr; + if (num_rows > out_rows_avail) + num_rows = out_rows_avail; + + (*cinfo->cconvert->color_convert) (cinfo, upsample->color_buf, + (JDIMENSION) upsample->next_row_out, + output_buf + *out_row_ctr, + (int) num_rows); + + /* Adjust counts */ + *out_row_ctr += num_rows; + upsample->rows_to_go -= num_rows; + upsample->next_row_out += num_rows; + /* When the buffer is emptied, declare this input row group consumed */ + if (upsample->next_row_out >= cinfo->max_v_samp_factor) + (*in_row_group_ctr)++; +} + + +/* + * These are the routines invoked by sep_upsample to upsample pixel values + * of a single component. One row group is processed per call. + */ + + +/* + * For full-size components, we just make color_buf[ci] point at the + * input buffer, and thus avoid copying any data. Note that this is + * safe only because sep_upsample doesn't declare the input row group + * "consumed" until we are done color converting and emitting it. + */ + +METHODDEF(void) +fullsize_upsample (j_decompress_ptr cinfo, jpeg_component_info *compptr, + JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr) +{ + *output_data_ptr = input_data; +} + + +/* + * This is a no-op version used for "uninteresting" components. + * These components will not be referenced by color conversion. + */ + +METHODDEF(void) +noop_upsample (j_decompress_ptr cinfo, jpeg_component_info *compptr, + JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr) +{ + *output_data_ptr = NULL; /* safety check */ +} + + +/* + * This version handles any integral sampling ratios. + * This is not used for typical JPEG files, so it need not be fast. + * Nor, for that matter, is it particularly accurate: the algorithm is + * simple replication of the input pixel onto the corresponding output + * pixels. The hi-falutin sampling literature refers to this as a + * "box filter". A box filter tends to introduce visible artifacts, + * so if you are actually going to use 3:1 or 4:1 sampling ratios + * you would be well advised to improve this code. + */ + +METHODDEF(void) +int_upsample (j_decompress_ptr cinfo, jpeg_component_info *compptr, + JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr) +{ + my_upsample_ptr upsample = (my_upsample_ptr) cinfo->upsample; + JSAMPARRAY output_data = *output_data_ptr; + register JSAMPROW inptr, outptr; + register JSAMPLE invalue; + register int h; + JSAMPROW outend; + int h_expand, v_expand; + int inrow, outrow; + + h_expand = upsample->h_expand[compptr->component_index]; + v_expand = upsample->v_expand[compptr->component_index]; + + inrow = outrow = 0; + while (outrow < cinfo->max_v_samp_factor) { + /* Generate one output row with proper horizontal expansion */ + inptr = input_data[inrow]; + outptr = output_data[outrow]; + outend = outptr + cinfo->output_width; + while (outptr < outend) { + invalue = *inptr++; /* don't need GETJSAMPLE() here */ + for (h = h_expand; h > 0; h--) { + *outptr++ = invalue; + } + } + /* Generate any additional output rows by duplicating the first one */ + if (v_expand > 1) { + jcopy_sample_rows(output_data, outrow, output_data, outrow+1, + v_expand-1, cinfo->output_width); + } + inrow++; + outrow += v_expand; + } +} + + +/* + * Fast processing for the common case of 2:1 horizontal and 1:1 vertical. + * It's still a box filter. + */ + +METHODDEF(void) +h2v1_upsample (j_decompress_ptr cinfo, jpeg_component_info *compptr, + JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr) +{ + JSAMPARRAY output_data = *output_data_ptr; + register JSAMPROW inptr, outptr; + register JSAMPLE invalue; + JSAMPROW outend; + int inrow; + + for (inrow = 0; inrow < cinfo->max_v_samp_factor; inrow++) { + inptr = input_data[inrow]; + outptr = output_data[inrow]; + outend = outptr + cinfo->output_width; + while (outptr < outend) { + invalue = *inptr++; /* don't need GETJSAMPLE() here */ + *outptr++ = invalue; + *outptr++ = invalue; + } + } +} + + +/* + * Fast processing for the common case of 2:1 horizontal and 2:1 vertical. + * It's still a box filter. + */ + +METHODDEF(void) +h2v2_upsample (j_decompress_ptr cinfo, jpeg_component_info *compptr, + JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr) +{ + JSAMPARRAY output_data = *output_data_ptr; + register JSAMPROW inptr, outptr; + register JSAMPLE invalue; + JSAMPROW outend; + int inrow, outrow; + + inrow = outrow = 0; + while (outrow < cinfo->max_v_samp_factor) { + inptr = input_data[inrow]; + outptr = output_data[outrow]; + outend = outptr + cinfo->output_width; + while (outptr < outend) { + invalue = *inptr++; /* don't need GETJSAMPLE() here */ + *outptr++ = invalue; + *outptr++ = invalue; + } + jcopy_sample_rows(output_data, outrow, output_data, outrow+1, + 1, cinfo->output_width); + inrow++; + outrow += 2; + } +} + + +/* + * Fancy processing for the common case of 2:1 horizontal and 1:1 vertical. + * + * The upsampling algorithm is linear interpolation between pixel centers, + * also known as a "triangle filter". This is a good compromise between + * speed and visual quality. The centers of the output pixels are 1/4 and 3/4 + * of the way between input pixel centers. + * + * A note about the "bias" calculations: when rounding fractional values to + * integer, we do not want to always round 0.5 up to the next integer. + * If we did that, we'd introduce a noticeable bias towards larger values. + * Instead, this code is arranged so that 0.5 will be rounded up or down at + * alternate pixel locations (a simple ordered dither pattern). + */ + +METHODDEF(void) +h2v1_fancy_upsample (j_decompress_ptr cinfo, jpeg_component_info *compptr, + JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr) +{ + JSAMPARRAY output_data = *output_data_ptr; + register JSAMPROW inptr, outptr; + register int invalue; + register JDIMENSION colctr; + int inrow; + + for (inrow = 0; inrow < cinfo->max_v_samp_factor; inrow++) { + inptr = input_data[inrow]; + outptr = output_data[inrow]; + /* Special case for first column */ + invalue = GETJSAMPLE(*inptr++); + *outptr++ = (JSAMPLE) invalue; + *outptr++ = (JSAMPLE) ((invalue * 3 + GETJSAMPLE(*inptr) + 2) >> 2); + + for (colctr = compptr->downsampled_width - 2; colctr > 0; colctr--) { + /* General case: 3/4 * nearer pixel + 1/4 * further pixel */ + invalue = GETJSAMPLE(*inptr++) * 3; + *outptr++ = (JSAMPLE) ((invalue + GETJSAMPLE(inptr[-2]) + 1) >> 2); + *outptr++ = (JSAMPLE) ((invalue + GETJSAMPLE(*inptr) + 2) >> 2); + } + + /* Special case for last column */ + invalue = GETJSAMPLE(*inptr); + *outptr++ = (JSAMPLE) ((invalue * 3 + GETJSAMPLE(inptr[-1]) + 1) >> 2); + *outptr++ = (JSAMPLE) invalue; + } +} + + +/* + * Fancy processing for 1:1 horizontal and 2:1 vertical (4:4:0 subsampling). + * + * This is a less common case, but it can be encountered when losslessly + * rotating/transposing a JPEG file that uses 4:2:2 chroma subsampling. + */ + +METHODDEF(void) +h1v2_fancy_upsample (j_decompress_ptr cinfo, jpeg_component_info *compptr, + JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr) +{ + JSAMPARRAY output_data = *output_data_ptr; + JSAMPROW inptr0, inptr1, outptr; +#if BITS_IN_JSAMPLE == 8 + int thiscolsum; +#else + JLONG thiscolsum; +#endif + JDIMENSION colctr; + int inrow, outrow, v; + + inrow = outrow = 0; + while (outrow < cinfo->max_v_samp_factor) { + for (v = 0; v < 2; v++) { + /* inptr0 points to nearest input row, inptr1 points to next nearest */ + inptr0 = input_data[inrow]; + if (v == 0) /* next nearest is row above */ + inptr1 = input_data[inrow-1]; + else /* next nearest is row below */ + inptr1 = input_data[inrow+1]; + outptr = output_data[outrow++]; + + for(colctr = 0; colctr < compptr->downsampled_width; colctr++) { + thiscolsum = GETJSAMPLE(*inptr0++) * 3 + GETJSAMPLE(*inptr1++); + *outptr++ = (JSAMPLE) ((thiscolsum + 1) >> 2); + } + } + inrow++; + } +} + + +/* + * Fancy processing for the common case of 2:1 horizontal and 2:1 vertical. + * Again a triangle filter; see comments for h2v1 case, above. + * + * It is OK for us to reference the adjacent input rows because we demanded + * context from the main buffer controller (see initialization code). + */ + +METHODDEF(void) +h2v2_fancy_upsample (j_decompress_ptr cinfo, jpeg_component_info *compptr, + JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr) +{ + JSAMPARRAY output_data = *output_data_ptr; + register JSAMPROW inptr0, inptr1, outptr; +#if BITS_IN_JSAMPLE == 8 + register int thiscolsum, lastcolsum, nextcolsum; +#else + register JLONG thiscolsum, lastcolsum, nextcolsum; +#endif + register JDIMENSION colctr; + int inrow, outrow, v; + + inrow = outrow = 0; + while (outrow < cinfo->max_v_samp_factor) { + for (v = 0; v < 2; v++) { + /* inptr0 points to nearest input row, inptr1 points to next nearest */ + inptr0 = input_data[inrow]; + if (v == 0) /* next nearest is row above */ + inptr1 = input_data[inrow-1]; + else /* next nearest is row below */ + inptr1 = input_data[inrow+1]; + outptr = output_data[outrow++]; + + /* Special case for first column */ + thiscolsum = GETJSAMPLE(*inptr0++) * 3 + GETJSAMPLE(*inptr1++); + nextcolsum = GETJSAMPLE(*inptr0++) * 3 + GETJSAMPLE(*inptr1++); + *outptr++ = (JSAMPLE) ((thiscolsum * 4 + 8) >> 4); + *outptr++ = (JSAMPLE) ((thiscolsum * 3 + nextcolsum + 7) >> 4); + lastcolsum = thiscolsum; thiscolsum = nextcolsum; + + for (colctr = compptr->downsampled_width - 2; colctr > 0; colctr--) { + /* General case: 3/4 * nearer pixel + 1/4 * further pixel in each */ + /* dimension, thus 9/16, 3/16, 3/16, 1/16 overall */ + nextcolsum = GETJSAMPLE(*inptr0++) * 3 + GETJSAMPLE(*inptr1++); + *outptr++ = (JSAMPLE) ((thiscolsum * 3 + lastcolsum + 8) >> 4); + *outptr++ = (JSAMPLE) ((thiscolsum * 3 + nextcolsum + 7) >> 4); + lastcolsum = thiscolsum; thiscolsum = nextcolsum; + } + + /* Special case for last column */ + *outptr++ = (JSAMPLE) ((thiscolsum * 3 + lastcolsum + 8) >> 4); + *outptr++ = (JSAMPLE) ((thiscolsum * 4 + 7) >> 4); + } + inrow++; + } +} + + +/* + * Module initialization routine for upsampling. + */ + +GLOBAL(void) +jinit_upsampler (j_decompress_ptr cinfo) +{ + my_upsample_ptr upsample; + int ci; + jpeg_component_info *compptr; + boolean need_buffer, do_fancy; + int h_in_group, v_in_group, h_out_group, v_out_group; + + if (!cinfo->master->jinit_upsampler_no_alloc) { + upsample = (my_upsample_ptr) + (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE, + sizeof(my_upsampler)); + cinfo->upsample = (struct jpeg_upsampler *) upsample; + upsample->pub.start_pass = start_pass_upsample; + upsample->pub.upsample = sep_upsample; + upsample->pub.need_context_rows = FALSE; /* until we find out differently */ + } else + upsample = (my_upsample_ptr) cinfo->upsample; + + if (cinfo->CCIR601_sampling) /* this isn't supported */ + ERREXIT(cinfo, JERR_CCIR601_NOTIMPL); + + /* jdmainct.c doesn't support context rows when min_DCT_scaled_size = 1, + * so don't ask for it. + */ + do_fancy = cinfo->do_fancy_upsampling && cinfo->_min_DCT_scaled_size > 1; + + /* Verify we can handle the sampling factors, select per-component methods, + * and create storage as needed. + */ + for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components; + ci++, compptr++) { + /* Compute size of an "input group" after IDCT scaling. This many samples + * are to be converted to max_h_samp_factor * max_v_samp_factor pixels. + */ + h_in_group = (compptr->h_samp_factor * compptr->_DCT_scaled_size) / + cinfo->_min_DCT_scaled_size; + v_in_group = (compptr->v_samp_factor * compptr->_DCT_scaled_size) / + cinfo->_min_DCT_scaled_size; + h_out_group = cinfo->max_h_samp_factor; + v_out_group = cinfo->max_v_samp_factor; + upsample->rowgroup_height[ci] = v_in_group; /* save for use later */ + need_buffer = TRUE; + if (! compptr->component_needed) { + /* Don't bother to upsample an uninteresting component. */ + upsample->methods[ci] = noop_upsample; + need_buffer = FALSE; + } else if (h_in_group == h_out_group && v_in_group == v_out_group) { + /* Fullsize components can be processed without any work. */ + upsample->methods[ci] = fullsize_upsample; + need_buffer = FALSE; + } else if (h_in_group * 2 == h_out_group && + v_in_group == v_out_group) { + /* Special cases for 2h1v upsampling */ + if (do_fancy && compptr->downsampled_width > 2) { + if (jsimd_can_h2v1_fancy_upsample()) + upsample->methods[ci] = jsimd_h2v1_fancy_upsample; + else + upsample->methods[ci] = h2v1_fancy_upsample; + } else { + if (jsimd_can_h2v1_upsample()) + upsample->methods[ci] = jsimd_h2v1_upsample; + else + upsample->methods[ci] = h2v1_upsample; + } + } else if (h_in_group == h_out_group && + v_in_group * 2 == v_out_group && do_fancy) { + /* Non-fancy upsampling is handled by the generic method */ + upsample->methods[ci] = h1v2_fancy_upsample; + upsample->pub.need_context_rows = TRUE; + } else if (h_in_group * 2 == h_out_group && + v_in_group * 2 == v_out_group) { + /* Special cases for 2h2v upsampling */ + if (do_fancy && compptr->downsampled_width > 2) { + if (jsimd_can_h2v2_fancy_upsample()) + upsample->methods[ci] = jsimd_h2v2_fancy_upsample; + else + upsample->methods[ci] = h2v2_fancy_upsample; + upsample->pub.need_context_rows = TRUE; + } else { + if (jsimd_can_h2v2_upsample()) + upsample->methods[ci] = jsimd_h2v2_upsample; + else + upsample->methods[ci] = h2v2_upsample; + } + } else if ((h_out_group % h_in_group) == 0 && + (v_out_group % v_in_group) == 0) { + /* Generic integral-factors upsampling method */ +#if defined(__mips__) + if (jsimd_can_int_upsample()) + upsample->methods[ci] = jsimd_int_upsample; + else +#endif + upsample->methods[ci] = int_upsample; + upsample->h_expand[ci] = (UINT8) (h_out_group / h_in_group); + upsample->v_expand[ci] = (UINT8) (v_out_group / v_in_group); + } else + ERREXIT(cinfo, JERR_FRACT_SAMPLE_NOTIMPL); + if (need_buffer && !cinfo->master->jinit_upsampler_no_alloc) { + upsample->color_buf[ci] = (*cinfo->mem->alloc_sarray) + ((j_common_ptr) cinfo, JPOOL_IMAGE, + (JDIMENSION) jround_up((long) cinfo->output_width, + (long) cinfo->max_h_samp_factor), + (JDIMENSION) cinfo->max_v_samp_factor); + } + } +} diff --git a/media/libjpeg/jdsample.h b/media/libjpeg/jdsample.h new file mode 100644 index 000000000..a6bf08a03 --- /dev/null +++ b/media/libjpeg/jdsample.h @@ -0,0 +1,50 @@ +/* + * jdsample.h + * + * This file was part of the Independent JPEG Group's software: + * Copyright (C) 1991-1996, Thomas G. Lane. + * For conditions of distribution and use, see the accompanying README.ijg + * file. + */ + +#define JPEG_INTERNALS +#include "jpeglib.h" + + +/* Pointer to routine to upsample a single component */ +typedef void (*upsample1_ptr) (j_decompress_ptr cinfo, + jpeg_component_info *compptr, + JSAMPARRAY input_data, + JSAMPARRAY *output_data_ptr); + +/* Private subobject */ + +typedef struct { + struct jpeg_upsampler pub; /* public fields */ + + /* Color conversion buffer. When using separate upsampling and color + * conversion steps, this buffer holds one upsampled row group until it + * has been color converted and output. + * Note: we do not allocate any storage for component(s) which are full-size, + * ie do not need rescaling. The corresponding entry of color_buf[] is + * simply set to point to the input data array, thereby avoiding copying. + */ + JSAMPARRAY color_buf[MAX_COMPONENTS]; + + /* Per-component upsampling method pointers */ + upsample1_ptr methods[MAX_COMPONENTS]; + + int next_row_out; /* counts rows emitted from color_buf */ + JDIMENSION rows_to_go; /* counts rows remaining in image */ + + /* Height of an input row group for each component. */ + int rowgroup_height[MAX_COMPONENTS]; + + /* These arrays save pixel expansion factors so that int_expand need not + * recompute them each time. They are unused for other upsampling methods. + */ + UINT8 h_expand[MAX_COMPONENTS]; + UINT8 v_expand[MAX_COMPONENTS]; +} my_upsampler; + +typedef my_upsampler *my_upsample_ptr; diff --git a/media/libjpeg/jdtrans.c b/media/libjpeg/jdtrans.c new file mode 100644 index 000000000..cfc85dd24 --- /dev/null +++ b/media/libjpeg/jdtrans.c @@ -0,0 +1,155 @@ +/* + * jdtrans.c + * + * This file was part of the Independent JPEG Group's software: + * Copyright (C) 1995-1997, Thomas G. Lane. + * It was modified by The libjpeg-turbo Project to include only code relevant + * to libjpeg-turbo. + * For conditions of distribution and use, see the accompanying README.ijg + * file. + * + * This file contains library routines for transcoding decompression, + * that is, reading raw DCT coefficient arrays from an input JPEG file. + * The routines in jdapimin.c will also be needed by a transcoder. + */ + +#define JPEG_INTERNALS +#include "jinclude.h" +#include "jpeglib.h" + + +/* Forward declarations */ +LOCAL(void) transdecode_master_selection (j_decompress_ptr cinfo); + + +/* + * Read the coefficient arrays from a JPEG file. + * jpeg_read_header must be completed before calling this. + * + * The entire image is read into a set of virtual coefficient-block arrays, + * one per component. The return value is a pointer to the array of + * virtual-array descriptors. These can be manipulated directly via the + * JPEG memory manager, or handed off to jpeg_write_coefficients(). + * To release the memory occupied by the virtual arrays, call + * jpeg_finish_decompress() when done with the data. + * + * An alternative usage is to simply obtain access to the coefficient arrays + * during a buffered-image-mode decompression operation. This is allowed + * after any jpeg_finish_output() call. The arrays can be accessed until + * jpeg_finish_decompress() is called. (Note that any call to the library + * may reposition the arrays, so don't rely on access_virt_barray() results + * to stay valid across library calls.) + * + * Returns NULL if suspended. This case need be checked only if + * a suspending data source is used. + */ + +GLOBAL(jvirt_barray_ptr *) +jpeg_read_coefficients (j_decompress_ptr cinfo) +{ + if (cinfo->global_state == DSTATE_READY) { + /* First call: initialize active modules */ + transdecode_master_selection(cinfo); + cinfo->global_state = DSTATE_RDCOEFS; + } + if (cinfo->global_state == DSTATE_RDCOEFS) { + /* Absorb whole file into the coef buffer */ + for (;;) { + int retcode; + /* Call progress monitor hook if present */ + if (cinfo->progress != NULL) + (*cinfo->progress->progress_monitor) ((j_common_ptr) cinfo); + /* Absorb some more input */ + retcode = (*cinfo->inputctl->consume_input) (cinfo); + if (retcode == JPEG_SUSPENDED) + return NULL; + if (retcode == JPEG_REACHED_EOI) + break; + /* Advance progress counter if appropriate */ + if (cinfo->progress != NULL && + (retcode == JPEG_ROW_COMPLETED || retcode == JPEG_REACHED_SOS)) { + if (++cinfo->progress->pass_counter >= cinfo->progress->pass_limit) { + /* startup underestimated number of scans; ratchet up one scan */ + cinfo->progress->pass_limit += (long) cinfo->total_iMCU_rows; + } + } + } + /* Set state so that jpeg_finish_decompress does the right thing */ + cinfo->global_state = DSTATE_STOPPING; + } + /* At this point we should be in state DSTATE_STOPPING if being used + * standalone, or in state DSTATE_BUFIMAGE if being invoked to get access + * to the coefficients during a full buffered-image-mode decompression. + */ + if ((cinfo->global_state == DSTATE_STOPPING || + cinfo->global_state == DSTATE_BUFIMAGE) && cinfo->buffered_image) { + return cinfo->coef->coef_arrays; + } + /* Oops, improper usage */ + ERREXIT1(cinfo, JERR_BAD_STATE, cinfo->global_state); + return NULL; /* keep compiler happy */ +} + + +/* + * Master selection of decompression modules for transcoding. + * This substitutes for jdmaster.c's initialization of the full decompressor. + */ + +LOCAL(void) +transdecode_master_selection (j_decompress_ptr cinfo) +{ + /* This is effectively a buffered-image operation. */ + cinfo->buffered_image = TRUE; + +#if JPEG_LIB_VERSION >= 80 + /* Compute output image dimensions and related values. */ + jpeg_core_output_dimensions(cinfo); +#endif + + /* Entropy decoding: either Huffman or arithmetic coding. */ + if (cinfo->arith_code) { +#ifdef D_ARITH_CODING_SUPPORTED + jinit_arith_decoder(cinfo); +#else + ERREXIT(cinfo, JERR_ARITH_NOTIMPL); +#endif + } else { + if (cinfo->progressive_mode) { +#ifdef D_PROGRESSIVE_SUPPORTED + jinit_phuff_decoder(cinfo); +#else + ERREXIT(cinfo, JERR_NOT_COMPILED); +#endif + } else + jinit_huff_decoder(cinfo); + } + + /* Always get a full-image coefficient buffer. */ + jinit_d_coef_controller(cinfo, TRUE); + + /* We can now tell the memory manager to allocate virtual arrays. */ + (*cinfo->mem->realize_virt_arrays) ((j_common_ptr) cinfo); + + /* Initialize input side of decompressor to consume first scan. */ + (*cinfo->inputctl->start_input_pass) (cinfo); + + /* Initialize progress monitoring. */ + if (cinfo->progress != NULL) { + int nscans; + /* Estimate number of scans to set pass_limit. */ + if (cinfo->progressive_mode) { + /* Arbitrarily estimate 2 interleaved DC scans + 3 AC scans/component. */ + nscans = 2 + 3 * cinfo->num_components; + } else if (cinfo->inputctl->has_multiple_scans) { + /* For a nonprogressive multiscan file, estimate 1 scan per component. */ + nscans = cinfo->num_components; + } else { + nscans = 1; + } + cinfo->progress->pass_counter = 0L; + cinfo->progress->pass_limit = (long) cinfo->total_iMCU_rows * nscans; + cinfo->progress->completed_passes = 0; + cinfo->progress->total_passes = 1; + } +} diff --git a/media/libjpeg/jerror.c b/media/libjpeg/jerror.c new file mode 100644 index 000000000..c31acd9ef --- /dev/null +++ b/media/libjpeg/jerror.c @@ -0,0 +1,251 @@ +/* + * jerror.c + * + * This file was part of the Independent JPEG Group's software: + * Copyright (C) 1991-1998, Thomas G. Lane. + * It was modified by The libjpeg-turbo Project to include only code relevant + * to libjpeg-turbo. + * For conditions of distribution and use, see the accompanying README.ijg + * file. + * + * This file contains simple error-reporting and trace-message routines. + * These are suitable for Unix-like systems and others where writing to + * stderr is the right thing to do. Many applications will want to replace + * some or all of these routines. + * + * If you define USE_WINDOWS_MESSAGEBOX in jconfig.h or in the makefile, + * you get a Windows-specific hack to display error messages in a dialog box. + * It ain't much, but it beats dropping error messages into the bit bucket, + * which is what happens to output to stderr under most Windows C compilers. + * + * These routines are used by both the compression and decompression code. + */ + +/* this is not a core library module, so it doesn't define JPEG_INTERNALS */ +#include "jinclude.h" +#include "jpeglib.h" +#include "jversion.h" +#include "jerror.h" + +#ifdef USE_WINDOWS_MESSAGEBOX +#include <windows.h> +#endif + +#ifndef EXIT_FAILURE /* define exit() codes if not provided */ +#define EXIT_FAILURE 1 +#endif + + +/* + * Create the message string table. + * We do this from the master message list in jerror.h by re-reading + * jerror.h with a suitable definition for macro JMESSAGE. + * The message table is made an external symbol just in case any applications + * want to refer to it directly. + */ + +#define JMESSAGE(code,string) string , + +const char * const jpeg_std_message_table[] = { +#include "jerror.h" + NULL +}; + + +/* + * Error exit handler: must not return to caller. + * + * Applications may override this if they want to get control back after + * an error. Typically one would longjmp somewhere instead of exiting. + * The setjmp buffer can be made a private field within an expanded error + * handler object. Note that the info needed to generate an error message + * is stored in the error object, so you can generate the message now or + * later, at your convenience. + * You should make sure that the JPEG object is cleaned up (with jpeg_abort + * or jpeg_destroy) at some point. + */ + +METHODDEF(void) +error_exit (j_common_ptr cinfo) +{ + /* Always display the message */ + (*cinfo->err->output_message) (cinfo); + + /* Let the memory manager delete any temp files before we die */ + jpeg_destroy(cinfo); + + exit(EXIT_FAILURE); +} + + +/* + * Actual output of an error or trace message. + * Applications may override this method to send JPEG messages somewhere + * other than stderr. + * + * On Windows, printing to stderr is generally completely useless, + * so we provide optional code to produce an error-dialog popup. + * Most Windows applications will still prefer to override this routine, + * but if they don't, it'll do something at least marginally useful. + * + * NOTE: to use the library in an environment that doesn't support the + * C stdio library, you may have to delete the call to fprintf() entirely, + * not just not use this routine. + */ + +METHODDEF(void) +output_message (j_common_ptr cinfo) +{ + char buffer[JMSG_LENGTH_MAX]; + + /* Create the message */ + (*cinfo->err->format_message) (cinfo, buffer); + +#ifdef USE_WINDOWS_MESSAGEBOX + /* Display it in a message dialog box */ + MessageBox(GetActiveWindow(), buffer, "JPEG Library Error", + MB_OK | MB_ICONERROR); +#else + /* Send it to stderr, adding a newline */ + fprintf(stderr, "%s\n", buffer); +#endif +} + + +/* + * Decide whether to emit a trace or warning message. + * msg_level is one of: + * -1: recoverable corrupt-data warning, may want to abort. + * 0: important advisory messages (always display to user). + * 1: first level of tracing detail. + * 2,3,...: successively more detailed tracing messages. + * An application might override this method if it wanted to abort on warnings + * or change the policy about which messages to display. + */ + +METHODDEF(void) +emit_message (j_common_ptr cinfo, int msg_level) +{ + struct jpeg_error_mgr *err = cinfo->err; + + if (msg_level < 0) { + /* It's a warning message. Since corrupt files may generate many warnings, + * the policy implemented here is to show only the first warning, + * unless trace_level >= 3. + */ + if (err->num_warnings == 0 || err->trace_level >= 3) + (*err->output_message) (cinfo); + /* Always count warnings in num_warnings. */ + err->num_warnings++; + } else { + /* It's a trace message. Show it if trace_level >= msg_level. */ + if (err->trace_level >= msg_level) + (*err->output_message) (cinfo); + } +} + + +/* + * Format a message string for the most recent JPEG error or message. + * The message is stored into buffer, which should be at least JMSG_LENGTH_MAX + * characters. Note that no '\n' character is added to the string. + * Few applications should need to override this method. + */ + +METHODDEF(void) +format_message (j_common_ptr cinfo, char *buffer) +{ + struct jpeg_error_mgr *err = cinfo->err; + int msg_code = err->msg_code; + const char *msgtext = NULL; + const char *msgptr; + char ch; + boolean isstring; + + /* Look up message string in proper table */ + if (msg_code > 0 && msg_code <= err->last_jpeg_message) { + msgtext = err->jpeg_message_table[msg_code]; + } else if (err->addon_message_table != NULL && + msg_code >= err->first_addon_message && + msg_code <= err->last_addon_message) { + msgtext = err->addon_message_table[msg_code - err->first_addon_message]; + } + + /* Defend against bogus message number */ + if (msgtext == NULL) { + err->msg_parm.i[0] = msg_code; + msgtext = err->jpeg_message_table[0]; + } + + /* Check for string parameter, as indicated by %s in the message text */ + isstring = FALSE; + msgptr = msgtext; + while ((ch = *msgptr++) != '\0') { + if (ch == '%') { + if (*msgptr == 's') isstring = TRUE; + break; + } + } + + /* Format the message into the passed buffer */ + if (isstring) + sprintf(buffer, msgtext, err->msg_parm.s); + else + sprintf(buffer, msgtext, + err->msg_parm.i[0], err->msg_parm.i[1], + err->msg_parm.i[2], err->msg_parm.i[3], + err->msg_parm.i[4], err->msg_parm.i[5], + err->msg_parm.i[6], err->msg_parm.i[7]); +} + + +/* + * Reset error state variables at start of a new image. + * This is called during compression startup to reset trace/error + * processing to default state, without losing any application-specific + * method pointers. An application might possibly want to override + * this method if it has additional error processing state. + */ + +METHODDEF(void) +reset_error_mgr (j_common_ptr cinfo) +{ + cinfo->err->num_warnings = 0; + /* trace_level is not reset since it is an application-supplied parameter */ + cinfo->err->msg_code = 0; /* may be useful as a flag for "no error" */ +} + + +/* + * Fill in the standard error-handling methods in a jpeg_error_mgr object. + * Typical call is: + * struct jpeg_compress_struct cinfo; + * struct jpeg_error_mgr err; + * + * cinfo.err = jpeg_std_error(&err); + * after which the application may override some of the methods. + */ + +GLOBAL(struct jpeg_error_mgr *) +jpeg_std_error (struct jpeg_error_mgr *err) +{ + err->error_exit = error_exit; + err->emit_message = emit_message; + err->output_message = output_message; + err->format_message = format_message; + err->reset_error_mgr = reset_error_mgr; + + err->trace_level = 0; /* default = no tracing */ + err->num_warnings = 0; /* no warnings emitted yet */ + err->msg_code = 0; /* may be useful as a flag for "no error" */ + + /* Initialize message table pointers */ + err->jpeg_message_table = jpeg_std_message_table; + err->last_jpeg_message = (int) JMSG_LASTMSGCODE - 1; + + err->addon_message_table = NULL; + err->first_addon_message = 0; /* for safety */ + err->last_addon_message = 0; + + return err; +} diff --git a/media/libjpeg/jerror.h b/media/libjpeg/jerror.h new file mode 100644 index 000000000..11a07cb5d --- /dev/null +++ b/media/libjpeg/jerror.h @@ -0,0 +1,317 @@ +/* + * jerror.h + * + * This file was part of the Independent JPEG Group's software: + * Copyright (C) 1994-1997, Thomas G. Lane. + * Modified 1997-2009 by Guido Vollbeding. + * libjpeg-turbo Modifications: + * Copyright (C) 2014, D. R. Commander. + * For conditions of distribution and use, see the accompanying README.ijg + * file. + * + * This file defines the error and message codes for the JPEG library. + * Edit this file to add new codes, or to translate the message strings to + * some other language. + * A set of error-reporting macros are defined too. Some applications using + * the JPEG library may wish to include this file to get the error codes + * and/or the macros. + */ + +/* + * To define the enum list of message codes, include this file without + * defining macro JMESSAGE. To create a message string table, include it + * again with a suitable JMESSAGE definition (see jerror.c for an example). + */ +#ifndef JMESSAGE +#ifndef JERROR_H +/* First time through, define the enum list */ +#define JMAKE_ENUM_LIST +#else +/* Repeated inclusions of this file are no-ops unless JMESSAGE is defined */ +#define JMESSAGE(code,string) +#endif /* JERROR_H */ +#endif /* JMESSAGE */ + +#ifdef JMAKE_ENUM_LIST + +typedef enum { + +#define JMESSAGE(code,string) code , + +#endif /* JMAKE_ENUM_LIST */ + +JMESSAGE(JMSG_NOMESSAGE, "Bogus message code %d") /* Must be first entry! */ + +/* For maintenance convenience, list is alphabetical by message code name */ +#if JPEG_LIB_VERSION < 70 +JMESSAGE(JERR_ARITH_NOTIMPL, + "Sorry, arithmetic coding is not implemented") +#endif +JMESSAGE(JERR_BAD_ALIGN_TYPE, "ALIGN_TYPE is wrong, please fix") +JMESSAGE(JERR_BAD_ALLOC_CHUNK, "MAX_ALLOC_CHUNK is wrong, please fix") +JMESSAGE(JERR_BAD_BUFFER_MODE, "Bogus buffer control mode") +JMESSAGE(JERR_BAD_COMPONENT_ID, "Invalid component ID %d in SOS") +#if JPEG_LIB_VERSION >= 70 +JMESSAGE(JERR_BAD_CROP_SPEC, "Invalid crop request") +#endif +JMESSAGE(JERR_BAD_DCT_COEF, "DCT coefficient out of range") +JMESSAGE(JERR_BAD_DCTSIZE, "IDCT output block size %d not supported") +#if JPEG_LIB_VERSION >= 70 +JMESSAGE(JERR_BAD_DROP_SAMPLING, + "Component index %d: mismatching sampling ratio %d:%d, %d:%d, %c") +#endif +JMESSAGE(JERR_BAD_HUFF_TABLE, "Bogus Huffman table definition") +JMESSAGE(JERR_BAD_IN_COLORSPACE, "Bogus input colorspace") +JMESSAGE(JERR_BAD_J_COLORSPACE, "Bogus JPEG colorspace") +JMESSAGE(JERR_BAD_LENGTH, "Bogus marker length") +JMESSAGE(JERR_BAD_LIB_VERSION, + "Wrong JPEG library version: library is %d, caller expects %d") +JMESSAGE(JERR_BAD_MCU_SIZE, "Sampling factors too large for interleaved scan") +JMESSAGE(JERR_BAD_POOL_ID, "Invalid memory pool code %d") +JMESSAGE(JERR_BAD_PRECISION, "Unsupported JPEG data precision %d") +JMESSAGE(JERR_BAD_PROGRESSION, + "Invalid progressive parameters Ss=%d Se=%d Ah=%d Al=%d") +JMESSAGE(JERR_BAD_PROG_SCRIPT, + "Invalid progressive parameters at scan script entry %d") +JMESSAGE(JERR_BAD_SAMPLING, "Bogus sampling factors") +JMESSAGE(JERR_BAD_SCAN_SCRIPT, "Invalid scan script at entry %d") +JMESSAGE(JERR_BAD_STATE, "Improper call to JPEG library in state %d") +JMESSAGE(JERR_BAD_STRUCT_SIZE, + "JPEG parameter struct mismatch: library thinks size is %u, caller expects %u") +JMESSAGE(JERR_BAD_VIRTUAL_ACCESS, "Bogus virtual array access") +JMESSAGE(JERR_BUFFER_SIZE, "Buffer passed to JPEG library is too small") +JMESSAGE(JERR_CANT_SUSPEND, "Suspension not allowed here") +JMESSAGE(JERR_CCIR601_NOTIMPL, "CCIR601 sampling not implemented yet") +JMESSAGE(JERR_COMPONENT_COUNT, "Too many color components: %d, max %d") +JMESSAGE(JERR_CONVERSION_NOTIMPL, "Unsupported color conversion request") +JMESSAGE(JERR_DAC_INDEX, "Bogus DAC index %d") +JMESSAGE(JERR_DAC_VALUE, "Bogus DAC value 0x%x") +JMESSAGE(JERR_DHT_INDEX, "Bogus DHT index %d") +JMESSAGE(JERR_DQT_INDEX, "Bogus DQT index %d") +JMESSAGE(JERR_EMPTY_IMAGE, "Empty JPEG image (DNL not supported)") +JMESSAGE(JERR_EMS_READ, "Read from EMS failed") +JMESSAGE(JERR_EMS_WRITE, "Write to EMS failed") +JMESSAGE(JERR_EOI_EXPECTED, "Didn't expect more than one scan") +JMESSAGE(JERR_FILE_READ, "Input file read error") +JMESSAGE(JERR_FILE_WRITE, "Output file write error --- out of disk space?") +JMESSAGE(JERR_FRACT_SAMPLE_NOTIMPL, "Fractional sampling not implemented yet") +JMESSAGE(JERR_HUFF_CLEN_OVERFLOW, "Huffman code size table overflow") +JMESSAGE(JERR_HUFF_MISSING_CODE, "Missing Huffman code table entry") +JMESSAGE(JERR_IMAGE_TOO_BIG, "Maximum supported image dimension is %u pixels") +JMESSAGE(JERR_INPUT_EMPTY, "Empty input file") +JMESSAGE(JERR_INPUT_EOF, "Premature end of input file") +JMESSAGE(JERR_MISMATCHED_QUANT_TABLE, + "Cannot transcode due to multiple use of quantization table %d") +JMESSAGE(JERR_MISSING_DATA, "Scan script does not transmit all data") +JMESSAGE(JERR_MODE_CHANGE, "Invalid color quantization mode change") +JMESSAGE(JERR_NOTIMPL, "Not implemented yet") +JMESSAGE(JERR_NOT_COMPILED, "Requested feature was omitted at compile time") +#if JPEG_LIB_VERSION >= 70 +JMESSAGE(JERR_NO_ARITH_TABLE, "Arithmetic table 0x%02x was not defined") +#endif +JMESSAGE(JERR_NO_BACKING_STORE, "Backing store not supported") +JMESSAGE(JERR_NO_HUFF_TABLE, "Huffman table 0x%02x was not defined") +JMESSAGE(JERR_NO_IMAGE, "JPEG datastream contains no image") +JMESSAGE(JERR_NO_QUANT_TABLE, "Quantization table 0x%02x was not defined") +JMESSAGE(JERR_NO_SOI, "Not a JPEG file: starts with 0x%02x 0x%02x") +JMESSAGE(JERR_OUT_OF_MEMORY, "Insufficient memory (case %d)") +JMESSAGE(JERR_QUANT_COMPONENTS, + "Cannot quantize more than %d color components") +JMESSAGE(JERR_QUANT_FEW_COLORS, "Cannot quantize to fewer than %d colors") +JMESSAGE(JERR_QUANT_MANY_COLORS, "Cannot quantize to more than %d colors") +JMESSAGE(JERR_SOF_DUPLICATE, "Invalid JPEG file structure: two SOF markers") +JMESSAGE(JERR_SOF_NO_SOS, "Invalid JPEG file structure: missing SOS marker") +JMESSAGE(JERR_SOF_UNSUPPORTED, "Unsupported JPEG process: SOF type 0x%02x") +JMESSAGE(JERR_SOI_DUPLICATE, "Invalid JPEG file structure: two SOI markers") +JMESSAGE(JERR_SOS_NO_SOF, "Invalid JPEG file structure: SOS before SOF") +JMESSAGE(JERR_TFILE_CREATE, "Failed to create temporary file %s") +JMESSAGE(JERR_TFILE_READ, "Read failed on temporary file") +JMESSAGE(JERR_TFILE_SEEK, "Seek failed on temporary file") +JMESSAGE(JERR_TFILE_WRITE, + "Write failed on temporary file --- out of disk space?") +JMESSAGE(JERR_TOO_LITTLE_DATA, "Application transferred too few scanlines") +JMESSAGE(JERR_UNKNOWN_MARKER, "Unsupported marker type 0x%02x") +JMESSAGE(JERR_VIRTUAL_BUG, "Virtual array controller messed up") +JMESSAGE(JERR_WIDTH_OVERFLOW, "Image too wide for this implementation") +JMESSAGE(JERR_XMS_READ, "Read from XMS failed") +JMESSAGE(JERR_XMS_WRITE, "Write to XMS failed") +JMESSAGE(JMSG_COPYRIGHT, JCOPYRIGHT_SHORT) +JMESSAGE(JMSG_VERSION, JVERSION) +JMESSAGE(JTRC_16BIT_TABLES, + "Caution: quantization tables are too coarse for baseline JPEG") +JMESSAGE(JTRC_ADOBE, + "Adobe APP14 marker: version %d, flags 0x%04x 0x%04x, transform %d") +JMESSAGE(JTRC_APP0, "Unknown APP0 marker (not JFIF), length %u") +JMESSAGE(JTRC_APP14, "Unknown APP14 marker (not Adobe), length %u") +JMESSAGE(JTRC_DAC, "Define Arithmetic Table 0x%02x: 0x%02x") +JMESSAGE(JTRC_DHT, "Define Huffman Table 0x%02x") +JMESSAGE(JTRC_DQT, "Define Quantization Table %d precision %d") +JMESSAGE(JTRC_DRI, "Define Restart Interval %u") +JMESSAGE(JTRC_EMS_CLOSE, "Freed EMS handle %u") +JMESSAGE(JTRC_EMS_OPEN, "Obtained EMS handle %u") +JMESSAGE(JTRC_EOI, "End Of Image") +JMESSAGE(JTRC_HUFFBITS, " %3d %3d %3d %3d %3d %3d %3d %3d") +JMESSAGE(JTRC_JFIF, "JFIF APP0 marker: version %d.%02d, density %dx%d %d") +JMESSAGE(JTRC_JFIF_BADTHUMBNAILSIZE, + "Warning: thumbnail image size does not match data length %u") +JMESSAGE(JTRC_JFIF_EXTENSION, + "JFIF extension marker: type 0x%02x, length %u") +JMESSAGE(JTRC_JFIF_THUMBNAIL, " with %d x %d thumbnail image") +JMESSAGE(JTRC_MISC_MARKER, "Miscellaneous marker 0x%02x, length %u") +JMESSAGE(JTRC_PARMLESS_MARKER, "Unexpected marker 0x%02x") +JMESSAGE(JTRC_QUANTVALS, " %4u %4u %4u %4u %4u %4u %4u %4u") +JMESSAGE(JTRC_QUANT_3_NCOLORS, "Quantizing to %d = %d*%d*%d colors") +JMESSAGE(JTRC_QUANT_NCOLORS, "Quantizing to %d colors") +JMESSAGE(JTRC_QUANT_SELECTED, "Selected %d colors for quantization") +JMESSAGE(JTRC_RECOVERY_ACTION, "At marker 0x%02x, recovery action %d") +JMESSAGE(JTRC_RST, "RST%d") +JMESSAGE(JTRC_SMOOTH_NOTIMPL, + "Smoothing not supported with nonstandard sampling ratios") +JMESSAGE(JTRC_SOF, "Start Of Frame 0x%02x: width=%u, height=%u, components=%d") +JMESSAGE(JTRC_SOF_COMPONENT, " Component %d: %dhx%dv q=%d") +JMESSAGE(JTRC_SOI, "Start of Image") +JMESSAGE(JTRC_SOS, "Start Of Scan: %d components") +JMESSAGE(JTRC_SOS_COMPONENT, " Component %d: dc=%d ac=%d") +JMESSAGE(JTRC_SOS_PARAMS, " Ss=%d, Se=%d, Ah=%d, Al=%d") +JMESSAGE(JTRC_TFILE_CLOSE, "Closed temporary file %s") +JMESSAGE(JTRC_TFILE_OPEN, "Opened temporary file %s") +JMESSAGE(JTRC_THUMB_JPEG, + "JFIF extension marker: JPEG-compressed thumbnail image, length %u") +JMESSAGE(JTRC_THUMB_PALETTE, + "JFIF extension marker: palette thumbnail image, length %u") +JMESSAGE(JTRC_THUMB_RGB, + "JFIF extension marker: RGB thumbnail image, length %u") +JMESSAGE(JTRC_UNKNOWN_IDS, + "Unrecognized component IDs %d %d %d, assuming YCbCr") +JMESSAGE(JTRC_XMS_CLOSE, "Freed XMS handle %u") +JMESSAGE(JTRC_XMS_OPEN, "Obtained XMS handle %u") +JMESSAGE(JWRN_ADOBE_XFORM, "Unknown Adobe color transform code %d") +#if JPEG_LIB_VERSION >= 70 +JMESSAGE(JWRN_ARITH_BAD_CODE, "Corrupt JPEG data: bad arithmetic code") +#endif +JMESSAGE(JWRN_BOGUS_PROGRESSION, + "Inconsistent progression sequence for component %d coefficient %d") +JMESSAGE(JWRN_EXTRANEOUS_DATA, + "Corrupt JPEG data: %u extraneous bytes before marker 0x%02x") +JMESSAGE(JWRN_HIT_MARKER, "Corrupt JPEG data: premature end of data segment") +JMESSAGE(JWRN_HUFF_BAD_CODE, "Corrupt JPEG data: bad Huffman code") +JMESSAGE(JWRN_JFIF_MAJOR, "Warning: unknown JFIF revision number %d.%02d") +JMESSAGE(JWRN_JPEG_EOF, "Premature end of JPEG file") +JMESSAGE(JWRN_MUST_RESYNC, + "Corrupt JPEG data: found marker 0x%02x instead of RST%d") +JMESSAGE(JWRN_NOT_SEQUENTIAL, "Invalid SOS parameters for sequential JPEG") +JMESSAGE(JWRN_TOO_MUCH_DATA, "Application transferred too many scanlines") +#if JPEG_LIB_VERSION < 70 +JMESSAGE(JERR_BAD_CROP_SPEC, "Invalid crop request") +#if defined(C_ARITH_CODING_SUPPORTED) || defined(D_ARITH_CODING_SUPPORTED) +JMESSAGE(JERR_NO_ARITH_TABLE, "Arithmetic table 0x%02x was not defined") +JMESSAGE(JWRN_ARITH_BAD_CODE, "Corrupt JPEG data: bad arithmetic code") +#endif +#endif + +#ifdef JMAKE_ENUM_LIST + + JMSG_LASTMSGCODE +} J_MESSAGE_CODE; + +#undef JMAKE_ENUM_LIST +#endif /* JMAKE_ENUM_LIST */ + +/* Zap JMESSAGE macro so that future re-inclusions do nothing by default */ +#undef JMESSAGE + + +#ifndef JERROR_H +#define JERROR_H + +/* Macros to simplify using the error and trace message stuff */ +/* The first parameter is either type of cinfo pointer */ + +/* Fatal errors (print message and exit) */ +#define ERREXIT(cinfo,code) \ + ((cinfo)->err->msg_code = (code), \ + (*(cinfo)->err->error_exit) ((j_common_ptr) (cinfo))) +#define ERREXIT1(cinfo,code,p1) \ + ((cinfo)->err->msg_code = (code), \ + (cinfo)->err->msg_parm.i[0] = (p1), \ + (*(cinfo)->err->error_exit) ((j_common_ptr) (cinfo))) +#define ERREXIT2(cinfo,code,p1,p2) \ + ((cinfo)->err->msg_code = (code), \ + (cinfo)->err->msg_parm.i[0] = (p1), \ + (cinfo)->err->msg_parm.i[1] = (p2), \ + (*(cinfo)->err->error_exit) ((j_common_ptr) (cinfo))) +#define ERREXIT3(cinfo,code,p1,p2,p3) \ + ((cinfo)->err->msg_code = (code), \ + (cinfo)->err->msg_parm.i[0] = (p1), \ + (cinfo)->err->msg_parm.i[1] = (p2), \ + (cinfo)->err->msg_parm.i[2] = (p3), \ + (*(cinfo)->err->error_exit) ((j_common_ptr) (cinfo))) +#define ERREXIT4(cinfo,code,p1,p2,p3,p4) \ + ((cinfo)->err->msg_code = (code), \ + (cinfo)->err->msg_parm.i[0] = (p1), \ + (cinfo)->err->msg_parm.i[1] = (p2), \ + (cinfo)->err->msg_parm.i[2] = (p3), \ + (cinfo)->err->msg_parm.i[3] = (p4), \ + (*(cinfo)->err->error_exit) ((j_common_ptr) (cinfo))) +#define ERREXITS(cinfo,code,str) \ + ((cinfo)->err->msg_code = (code), \ + strncpy((cinfo)->err->msg_parm.s, (str), JMSG_STR_PARM_MAX), \ + (*(cinfo)->err->error_exit) ((j_common_ptr) (cinfo))) + +#define MAKESTMT(stuff) do { stuff } while (0) + +/* Nonfatal errors (we can keep going, but the data is probably corrupt) */ +#define WARNMS(cinfo,code) \ + ((cinfo)->err->msg_code = (code), \ + (*(cinfo)->err->emit_message) ((j_common_ptr) (cinfo), -1)) +#define WARNMS1(cinfo,code,p1) \ + ((cinfo)->err->msg_code = (code), \ + (cinfo)->err->msg_parm.i[0] = (p1), \ + (*(cinfo)->err->emit_message) ((j_common_ptr) (cinfo), -1)) +#define WARNMS2(cinfo,code,p1,p2) \ + ((cinfo)->err->msg_code = (code), \ + (cinfo)->err->msg_parm.i[0] = (p1), \ + (cinfo)->err->msg_parm.i[1] = (p2), \ + (*(cinfo)->err->emit_message) ((j_common_ptr) (cinfo), -1)) + +/* Informational/debugging messages */ +#define TRACEMS(cinfo,lvl,code) \ + ((cinfo)->err->msg_code = (code), \ + (*(cinfo)->err->emit_message) ((j_common_ptr) (cinfo), (lvl))) +#define TRACEMS1(cinfo,lvl,code,p1) \ + ((cinfo)->err->msg_code = (code), \ + (cinfo)->err->msg_parm.i[0] = (p1), \ + (*(cinfo)->err->emit_message) ((j_common_ptr) (cinfo), (lvl))) +#define TRACEMS2(cinfo,lvl,code,p1,p2) \ + ((cinfo)->err->msg_code = (code), \ + (cinfo)->err->msg_parm.i[0] = (p1), \ + (cinfo)->err->msg_parm.i[1] = (p2), \ + (*(cinfo)->err->emit_message) ((j_common_ptr) (cinfo), (lvl))) +#define TRACEMS3(cinfo,lvl,code,p1,p2,p3) \ + MAKESTMT(int * _mp = (cinfo)->err->msg_parm.i; \ + _mp[0] = (p1); _mp[1] = (p2); _mp[2] = (p3); \ + (cinfo)->err->msg_code = (code); \ + (*(cinfo)->err->emit_message) ((j_common_ptr) (cinfo), (lvl)); ) +#define TRACEMS4(cinfo,lvl,code,p1,p2,p3,p4) \ + MAKESTMT(int * _mp = (cinfo)->err->msg_parm.i; \ + _mp[0] = (p1); _mp[1] = (p2); _mp[2] = (p3); _mp[3] = (p4); \ + (cinfo)->err->msg_code = (code); \ + (*(cinfo)->err->emit_message) ((j_common_ptr) (cinfo), (lvl)); ) +#define TRACEMS5(cinfo,lvl,code,p1,p2,p3,p4,p5) \ + MAKESTMT(int * _mp = (cinfo)->err->msg_parm.i; \ + _mp[0] = (p1); _mp[1] = (p2); _mp[2] = (p3); _mp[3] = (p4); \ + _mp[4] = (p5); \ + (cinfo)->err->msg_code = (code); \ + (*(cinfo)->err->emit_message) ((j_common_ptr) (cinfo), (lvl)); ) +#define TRACEMS8(cinfo,lvl,code,p1,p2,p3,p4,p5,p6,p7,p8) \ + MAKESTMT(int * _mp = (cinfo)->err->msg_parm.i; \ + _mp[0] = (p1); _mp[1] = (p2); _mp[2] = (p3); _mp[3] = (p4); \ + _mp[4] = (p5); _mp[5] = (p6); _mp[6] = (p7); _mp[7] = (p8); \ + (cinfo)->err->msg_code = (code); \ + (*(cinfo)->err->emit_message) ((j_common_ptr) (cinfo), (lvl)); ) +#define TRACEMSS(cinfo,lvl,code,str) \ + ((cinfo)->err->msg_code = (code), \ + strncpy((cinfo)->err->msg_parm.s, (str), JMSG_STR_PARM_MAX), \ + (*(cinfo)->err->emit_message) ((j_common_ptr) (cinfo), (lvl))) + +#endif /* JERROR_H */ diff --git a/media/libjpeg/jfdctflt.c b/media/libjpeg/jfdctflt.c new file mode 100644 index 000000000..b3da3ebda --- /dev/null +++ b/media/libjpeg/jfdctflt.c @@ -0,0 +1,169 @@ +/* + * jfdctflt.c + * + * Copyright (C) 1994-1996, Thomas G. Lane. + * This file is part of the Independent JPEG Group's software. + * For conditions of distribution and use, see the accompanying README.ijg + * file. + * + * This file contains a floating-point implementation of the + * forward DCT (Discrete Cosine Transform). + * + * This implementation should be more accurate than either of the integer + * DCT implementations. However, it may not give the same results on all + * machines because of differences in roundoff behavior. Speed will depend + * on the hardware's floating point capacity. + * + * A 2-D DCT can be done by 1-D DCT on each row followed by 1-D DCT + * on each column. Direct algorithms are also available, but they are + * much more complex and seem not to be any faster when reduced to code. + * + * This implementation is based on Arai, Agui, and Nakajima's algorithm for + * scaled DCT. Their original paper (Trans. IEICE E-71(11):1095) is in + * Japanese, but the algorithm is described in the Pennebaker & Mitchell + * JPEG textbook (see REFERENCES section in file README.ijg). The following + * code is based directly on figure 4-8 in P&M. + * While an 8-point DCT cannot be done in less than 11 multiplies, it is + * possible to arrange the computation so that many of the multiplies are + * simple scalings of the final outputs. These multiplies can then be + * folded into the multiplications or divisions by the JPEG quantization + * table entries. The AA&N method leaves only 5 multiplies and 29 adds + * to be done in the DCT itself. + * The primary disadvantage of this method is that with a fixed-point + * implementation, accuracy is lost due to imprecise representation of the + * scaled quantization values. However, that problem does not arise if + * we use floating point arithmetic. + */ + +#define JPEG_INTERNALS +#include "jinclude.h" +#include "jpeglib.h" +#include "jdct.h" /* Private declarations for DCT subsystem */ + +#ifdef DCT_FLOAT_SUPPORTED + + +/* + * This module is specialized to the case DCTSIZE = 8. + */ + +#if DCTSIZE != 8 + Sorry, this code only copes with 8x8 DCTs. /* deliberate syntax err */ +#endif + + +/* + * Perform the forward DCT on one block of samples. + */ + +GLOBAL(void) +jpeg_fdct_float (FAST_FLOAT *data) +{ + FAST_FLOAT tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + FAST_FLOAT tmp10, tmp11, tmp12, tmp13; + FAST_FLOAT z1, z2, z3, z4, z5, z11, z13; + FAST_FLOAT *dataptr; + int ctr; + + /* Pass 1: process rows. */ + + dataptr = data; + for (ctr = DCTSIZE-1; ctr >= 0; ctr--) { + tmp0 = dataptr[0] + dataptr[7]; + tmp7 = dataptr[0] - dataptr[7]; + tmp1 = dataptr[1] + dataptr[6]; + tmp6 = dataptr[1] - dataptr[6]; + tmp2 = dataptr[2] + dataptr[5]; + tmp5 = dataptr[2] - dataptr[5]; + tmp3 = dataptr[3] + dataptr[4]; + tmp4 = dataptr[3] - dataptr[4]; + + /* Even part */ + + tmp10 = tmp0 + tmp3; /* phase 2 */ + tmp13 = tmp0 - tmp3; + tmp11 = tmp1 + tmp2; + tmp12 = tmp1 - tmp2; + + dataptr[0] = tmp10 + tmp11; /* phase 3 */ + dataptr[4] = tmp10 - tmp11; + + z1 = (tmp12 + tmp13) * ((FAST_FLOAT) 0.707106781); /* c4 */ + dataptr[2] = tmp13 + z1; /* phase 5 */ + dataptr[6] = tmp13 - z1; + + /* Odd part */ + + tmp10 = tmp4 + tmp5; /* phase 2 */ + tmp11 = tmp5 + tmp6; + tmp12 = tmp6 + tmp7; + + /* The rotator is modified from fig 4-8 to avoid extra negations. */ + z5 = (tmp10 - tmp12) * ((FAST_FLOAT) 0.382683433); /* c6 */ + z2 = ((FAST_FLOAT) 0.541196100) * tmp10 + z5; /* c2-c6 */ + z4 = ((FAST_FLOAT) 1.306562965) * tmp12 + z5; /* c2+c6 */ + z3 = tmp11 * ((FAST_FLOAT) 0.707106781); /* c4 */ + + z11 = tmp7 + z3; /* phase 5 */ + z13 = tmp7 - z3; + + dataptr[5] = z13 + z2; /* phase 6 */ + dataptr[3] = z13 - z2; + dataptr[1] = z11 + z4; + dataptr[7] = z11 - z4; + + dataptr += DCTSIZE; /* advance pointer to next row */ + } + + /* Pass 2: process columns. */ + + dataptr = data; + for (ctr = DCTSIZE-1; ctr >= 0; ctr--) { + tmp0 = dataptr[DCTSIZE*0] + dataptr[DCTSIZE*7]; + tmp7 = dataptr[DCTSIZE*0] - dataptr[DCTSIZE*7]; + tmp1 = dataptr[DCTSIZE*1] + dataptr[DCTSIZE*6]; + tmp6 = dataptr[DCTSIZE*1] - dataptr[DCTSIZE*6]; + tmp2 = dataptr[DCTSIZE*2] + dataptr[DCTSIZE*5]; + tmp5 = dataptr[DCTSIZE*2] - dataptr[DCTSIZE*5]; + tmp3 = dataptr[DCTSIZE*3] + dataptr[DCTSIZE*4]; + tmp4 = dataptr[DCTSIZE*3] - dataptr[DCTSIZE*4]; + + /* Even part */ + + tmp10 = tmp0 + tmp3; /* phase 2 */ + tmp13 = tmp0 - tmp3; + tmp11 = tmp1 + tmp2; + tmp12 = tmp1 - tmp2; + + dataptr[DCTSIZE*0] = tmp10 + tmp11; /* phase 3 */ + dataptr[DCTSIZE*4] = tmp10 - tmp11; + + z1 = (tmp12 + tmp13) * ((FAST_FLOAT) 0.707106781); /* c4 */ + dataptr[DCTSIZE*2] = tmp13 + z1; /* phase 5 */ + dataptr[DCTSIZE*6] = tmp13 - z1; + + /* Odd part */ + + tmp10 = tmp4 + tmp5; /* phase 2 */ + tmp11 = tmp5 + tmp6; + tmp12 = tmp6 + tmp7; + + /* The rotator is modified from fig 4-8 to avoid extra negations. */ + z5 = (tmp10 - tmp12) * ((FAST_FLOAT) 0.382683433); /* c6 */ + z2 = ((FAST_FLOAT) 0.541196100) * tmp10 + z5; /* c2-c6 */ + z4 = ((FAST_FLOAT) 1.306562965) * tmp12 + z5; /* c2+c6 */ + z3 = tmp11 * ((FAST_FLOAT) 0.707106781); /* c4 */ + + z11 = tmp7 + z3; /* phase 5 */ + z13 = tmp7 - z3; + + dataptr[DCTSIZE*5] = z13 + z2; /* phase 6 */ + dataptr[DCTSIZE*3] = z13 - z2; + dataptr[DCTSIZE*1] = z11 + z4; + dataptr[DCTSIZE*7] = z11 - z4; + + dataptr++; /* advance pointer to next column */ + } +} + +#endif /* DCT_FLOAT_SUPPORTED */ diff --git a/media/libjpeg/jfdctfst.c b/media/libjpeg/jfdctfst.c new file mode 100644 index 000000000..5cd83a7b8 --- /dev/null +++ b/media/libjpeg/jfdctfst.c @@ -0,0 +1,227 @@ +/* + * jfdctfst.c + * + * This file was part of the Independent JPEG Group's software: + * Copyright (C) 1994-1996, Thomas G. Lane. + * libjpeg-turbo Modifications: + * Copyright (C) 2015, D. R. Commander. + * For conditions of distribution and use, see the accompanying README.ijg + * file. + * + * This file contains a fast, not so accurate integer implementation of the + * forward DCT (Discrete Cosine Transform). + * + * A 2-D DCT can be done by 1-D DCT on each row followed by 1-D DCT + * on each column. Direct algorithms are also available, but they are + * much more complex and seem not to be any faster when reduced to code. + * + * This implementation is based on Arai, Agui, and Nakajima's algorithm for + * scaled DCT. Their original paper (Trans. IEICE E-71(11):1095) is in + * Japanese, but the algorithm is described in the Pennebaker & Mitchell + * JPEG textbook (see REFERENCES section in file README.ijg). The following + * code is based directly on figure 4-8 in P&M. + * While an 8-point DCT cannot be done in less than 11 multiplies, it is + * possible to arrange the computation so that many of the multiplies are + * simple scalings of the final outputs. These multiplies can then be + * folded into the multiplications or divisions by the JPEG quantization + * table entries. The AA&N method leaves only 5 multiplies and 29 adds + * to be done in the DCT itself. + * The primary disadvantage of this method is that with fixed-point math, + * accuracy is lost due to imprecise representation of the scaled + * quantization values. The smaller the quantization table entry, the less + * precise the scaled value, so this implementation does worse with high- + * quality-setting files than with low-quality ones. + */ + +#define JPEG_INTERNALS +#include "jinclude.h" +#include "jpeglib.h" +#include "jdct.h" /* Private declarations for DCT subsystem */ + +#ifdef DCT_IFAST_SUPPORTED + + +/* + * This module is specialized to the case DCTSIZE = 8. + */ + +#if DCTSIZE != 8 + Sorry, this code only copes with 8x8 DCTs. /* deliberate syntax err */ +#endif + + +/* Scaling decisions are generally the same as in the LL&M algorithm; + * see jfdctint.c for more details. However, we choose to descale + * (right shift) multiplication products as soon as they are formed, + * rather than carrying additional fractional bits into subsequent additions. + * This compromises accuracy slightly, but it lets us save a few shifts. + * More importantly, 16-bit arithmetic is then adequate (for 8-bit samples) + * everywhere except in the multiplications proper; this saves a good deal + * of work on 16-bit-int machines. + * + * Again to save a few shifts, the intermediate results between pass 1 and + * pass 2 are not upscaled, but are represented only to integral precision. + * + * A final compromise is to represent the multiplicative constants to only + * 8 fractional bits, rather than 13. This saves some shifting work on some + * machines, and may also reduce the cost of multiplication (since there + * are fewer one-bits in the constants). + */ + +#define CONST_BITS 8 + + +/* Some C compilers fail to reduce "FIX(constant)" at compile time, thus + * causing a lot of useless floating-point operations at run time. + * To get around this we use the following pre-calculated constants. + * If you change CONST_BITS you may want to add appropriate values. + * (With a reasonable C compiler, you can just rely on the FIX() macro...) + */ + +#if CONST_BITS == 8 +#define FIX_0_382683433 ((JLONG) 98) /* FIX(0.382683433) */ +#define FIX_0_541196100 ((JLONG) 139) /* FIX(0.541196100) */ +#define FIX_0_707106781 ((JLONG) 181) /* FIX(0.707106781) */ +#define FIX_1_306562965 ((JLONG) 334) /* FIX(1.306562965) */ +#else +#define FIX_0_382683433 FIX(0.382683433) +#define FIX_0_541196100 FIX(0.541196100) +#define FIX_0_707106781 FIX(0.707106781) +#define FIX_1_306562965 FIX(1.306562965) +#endif + + +/* We can gain a little more speed, with a further compromise in accuracy, + * by omitting the addition in a descaling shift. This yields an incorrectly + * rounded result half the time... + */ + +#ifndef USE_ACCURATE_ROUNDING +#undef DESCALE +#define DESCALE(x,n) RIGHT_SHIFT(x, n) +#endif + + +/* Multiply a DCTELEM variable by an JLONG constant, and immediately + * descale to yield a DCTELEM result. + */ + +#define MULTIPLY(var,const) ((DCTELEM) DESCALE((var) * (const), CONST_BITS)) + + +/* + * Perform the forward DCT on one block of samples. + */ + +GLOBAL(void) +jpeg_fdct_ifast (DCTELEM *data) +{ + DCTELEM tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + DCTELEM tmp10, tmp11, tmp12, tmp13; + DCTELEM z1, z2, z3, z4, z5, z11, z13; + DCTELEM *dataptr; + int ctr; + SHIFT_TEMPS + + /* Pass 1: process rows. */ + + dataptr = data; + for (ctr = DCTSIZE-1; ctr >= 0; ctr--) { + tmp0 = dataptr[0] + dataptr[7]; + tmp7 = dataptr[0] - dataptr[7]; + tmp1 = dataptr[1] + dataptr[6]; + tmp6 = dataptr[1] - dataptr[6]; + tmp2 = dataptr[2] + dataptr[5]; + tmp5 = dataptr[2] - dataptr[5]; + tmp3 = dataptr[3] + dataptr[4]; + tmp4 = dataptr[3] - dataptr[4]; + + /* Even part */ + + tmp10 = tmp0 + tmp3; /* phase 2 */ + tmp13 = tmp0 - tmp3; + tmp11 = tmp1 + tmp2; + tmp12 = tmp1 - tmp2; + + dataptr[0] = tmp10 + tmp11; /* phase 3 */ + dataptr[4] = tmp10 - tmp11; + + z1 = MULTIPLY(tmp12 + tmp13, FIX_0_707106781); /* c4 */ + dataptr[2] = tmp13 + z1; /* phase 5 */ + dataptr[6] = tmp13 - z1; + + /* Odd part */ + + tmp10 = tmp4 + tmp5; /* phase 2 */ + tmp11 = tmp5 + tmp6; + tmp12 = tmp6 + tmp7; + + /* The rotator is modified from fig 4-8 to avoid extra negations. */ + z5 = MULTIPLY(tmp10 - tmp12, FIX_0_382683433); /* c6 */ + z2 = MULTIPLY(tmp10, FIX_0_541196100) + z5; /* c2-c6 */ + z4 = MULTIPLY(tmp12, FIX_1_306562965) + z5; /* c2+c6 */ + z3 = MULTIPLY(tmp11, FIX_0_707106781); /* c4 */ + + z11 = tmp7 + z3; /* phase 5 */ + z13 = tmp7 - z3; + + dataptr[5] = z13 + z2; /* phase 6 */ + dataptr[3] = z13 - z2; + dataptr[1] = z11 + z4; + dataptr[7] = z11 - z4; + + dataptr += DCTSIZE; /* advance pointer to next row */ + } + + /* Pass 2: process columns. */ + + dataptr = data; + for (ctr = DCTSIZE-1; ctr >= 0; ctr--) { + tmp0 = dataptr[DCTSIZE*0] + dataptr[DCTSIZE*7]; + tmp7 = dataptr[DCTSIZE*0] - dataptr[DCTSIZE*7]; + tmp1 = dataptr[DCTSIZE*1] + dataptr[DCTSIZE*6]; + tmp6 = dataptr[DCTSIZE*1] - dataptr[DCTSIZE*6]; + tmp2 = dataptr[DCTSIZE*2] + dataptr[DCTSIZE*5]; + tmp5 = dataptr[DCTSIZE*2] - dataptr[DCTSIZE*5]; + tmp3 = dataptr[DCTSIZE*3] + dataptr[DCTSIZE*4]; + tmp4 = dataptr[DCTSIZE*3] - dataptr[DCTSIZE*4]; + + /* Even part */ + + tmp10 = tmp0 + tmp3; /* phase 2 */ + tmp13 = tmp0 - tmp3; + tmp11 = tmp1 + tmp2; + tmp12 = tmp1 - tmp2; + + dataptr[DCTSIZE*0] = tmp10 + tmp11; /* phase 3 */ + dataptr[DCTSIZE*4] = tmp10 - tmp11; + + z1 = MULTIPLY(tmp12 + tmp13, FIX_0_707106781); /* c4 */ + dataptr[DCTSIZE*2] = tmp13 + z1; /* phase 5 */ + dataptr[DCTSIZE*6] = tmp13 - z1; + + /* Odd part */ + + tmp10 = tmp4 + tmp5; /* phase 2 */ + tmp11 = tmp5 + tmp6; + tmp12 = tmp6 + tmp7; + + /* The rotator is modified from fig 4-8 to avoid extra negations. */ + z5 = MULTIPLY(tmp10 - tmp12, FIX_0_382683433); /* c6 */ + z2 = MULTIPLY(tmp10, FIX_0_541196100) + z5; /* c2-c6 */ + z4 = MULTIPLY(tmp12, FIX_1_306562965) + z5; /* c2+c6 */ + z3 = MULTIPLY(tmp11, FIX_0_707106781); /* c4 */ + + z11 = tmp7 + z3; /* phase 5 */ + z13 = tmp7 - z3; + + dataptr[DCTSIZE*5] = z13 + z2; /* phase 6 */ + dataptr[DCTSIZE*3] = z13 - z2; + dataptr[DCTSIZE*1] = z11 + z4; + dataptr[DCTSIZE*7] = z11 - z4; + + dataptr++; /* advance pointer to next column */ + } +} + +#endif /* DCT_IFAST_SUPPORTED */ diff --git a/media/libjpeg/jfdctint.c b/media/libjpeg/jfdctint.c new file mode 100644 index 000000000..169bb942c --- /dev/null +++ b/media/libjpeg/jfdctint.c @@ -0,0 +1,286 @@ +/* + * jfdctint.c + * + * This file was part of the Independent JPEG Group's software. + * Copyright (C) 1991-1996, Thomas G. Lane. + * libjpeg-turbo Modifications: + * Copyright (C) 2015, D. R. Commander. + * For conditions of distribution and use, see the accompanying README.ijg + * file. + * + * This file contains a slow-but-accurate integer implementation of the + * forward DCT (Discrete Cosine Transform). + * + * A 2-D DCT can be done by 1-D DCT on each row followed by 1-D DCT + * on each column. Direct algorithms are also available, but they are + * much more complex and seem not to be any faster when reduced to code. + * + * This implementation is based on an algorithm described in + * C. Loeffler, A. Ligtenberg and G. Moschytz, "Practical Fast 1-D DCT + * Algorithms with 11 Multiplications", Proc. Int'l. Conf. on Acoustics, + * Speech, and Signal Processing 1989 (ICASSP '89), pp. 988-991. + * The primary algorithm described there uses 11 multiplies and 29 adds. + * We use their alternate method with 12 multiplies and 32 adds. + * The advantage of this method is that no data path contains more than one + * multiplication; this allows a very simple and accurate implementation in + * scaled fixed-point arithmetic, with a minimal number of shifts. + */ + +#define JPEG_INTERNALS +#include "jinclude.h" +#include "jpeglib.h" +#include "jdct.h" /* Private declarations for DCT subsystem */ + +#ifdef DCT_ISLOW_SUPPORTED + + +/* + * This module is specialized to the case DCTSIZE = 8. + */ + +#if DCTSIZE != 8 + Sorry, this code only copes with 8x8 DCTs. /* deliberate syntax err */ +#endif + + +/* + * The poop on this scaling stuff is as follows: + * + * Each 1-D DCT step produces outputs which are a factor of sqrt(N) + * larger than the true DCT outputs. The final outputs are therefore + * a factor of N larger than desired; since N=8 this can be cured by + * a simple right shift at the end of the algorithm. The advantage of + * this arrangement is that we save two multiplications per 1-D DCT, + * because the y0 and y4 outputs need not be divided by sqrt(N). + * In the IJG code, this factor of 8 is removed by the quantization step + * (in jcdctmgr.c), NOT in this module. + * + * We have to do addition and subtraction of the integer inputs, which + * is no problem, and multiplication by fractional constants, which is + * a problem to do in integer arithmetic. We multiply all the constants + * by CONST_SCALE and convert them to integer constants (thus retaining + * CONST_BITS bits of precision in the constants). After doing a + * multiplication we have to divide the product by CONST_SCALE, with proper + * rounding, to produce the correct output. This division can be done + * cheaply as a right shift of CONST_BITS bits. We postpone shifting + * as long as possible so that partial sums can be added together with + * full fractional precision. + * + * The outputs of the first pass are scaled up by PASS1_BITS bits so that + * they are represented to better-than-integral precision. These outputs + * require BITS_IN_JSAMPLE + PASS1_BITS + 3 bits; this fits in a 16-bit word + * with the recommended scaling. (For 12-bit sample data, the intermediate + * array is JLONG anyway.) + * + * To avoid overflow of the 32-bit intermediate results in pass 2, we must + * have BITS_IN_JSAMPLE + CONST_BITS + PASS1_BITS <= 26. Error analysis + * shows that the values given below are the most effective. + */ + +#if BITS_IN_JSAMPLE == 8 +#define CONST_BITS 13 +#define PASS1_BITS 2 +#else +#define CONST_BITS 13 +#define PASS1_BITS 1 /* lose a little precision to avoid overflow */ +#endif + +/* Some C compilers fail to reduce "FIX(constant)" at compile time, thus + * causing a lot of useless floating-point operations at run time. + * To get around this we use the following pre-calculated constants. + * If you change CONST_BITS you may want to add appropriate values. + * (With a reasonable C compiler, you can just rely on the FIX() macro...) + */ + +#if CONST_BITS == 13 +#define FIX_0_298631336 ((JLONG) 2446) /* FIX(0.298631336) */ +#define FIX_0_390180644 ((JLONG) 3196) /* FIX(0.390180644) */ +#define FIX_0_541196100 ((JLONG) 4433) /* FIX(0.541196100) */ +#define FIX_0_765366865 ((JLONG) 6270) /* FIX(0.765366865) */ +#define FIX_0_899976223 ((JLONG) 7373) /* FIX(0.899976223) */ +#define FIX_1_175875602 ((JLONG) 9633) /* FIX(1.175875602) */ +#define FIX_1_501321110 ((JLONG) 12299) /* FIX(1.501321110) */ +#define FIX_1_847759065 ((JLONG) 15137) /* FIX(1.847759065) */ +#define FIX_1_961570560 ((JLONG) 16069) /* FIX(1.961570560) */ +#define FIX_2_053119869 ((JLONG) 16819) /* FIX(2.053119869) */ +#define FIX_2_562915447 ((JLONG) 20995) /* FIX(2.562915447) */ +#define FIX_3_072711026 ((JLONG) 25172) /* FIX(3.072711026) */ +#else +#define FIX_0_298631336 FIX(0.298631336) +#define FIX_0_390180644 FIX(0.390180644) +#define FIX_0_541196100 FIX(0.541196100) +#define FIX_0_765366865 FIX(0.765366865) +#define FIX_0_899976223 FIX(0.899976223) +#define FIX_1_175875602 FIX(1.175875602) +#define FIX_1_501321110 FIX(1.501321110) +#define FIX_1_847759065 FIX(1.847759065) +#define FIX_1_961570560 FIX(1.961570560) +#define FIX_2_053119869 FIX(2.053119869) +#define FIX_2_562915447 FIX(2.562915447) +#define FIX_3_072711026 FIX(3.072711026) +#endif + + +/* Multiply an JLONG variable by an JLONG constant to yield an JLONG result. + * For 8-bit samples with the recommended scaling, all the variable + * and constant values involved are no more than 16 bits wide, so a + * 16x16->32 bit multiply can be used instead of a full 32x32 multiply. + * For 12-bit samples, a full 32-bit multiplication will be needed. + */ + +#if BITS_IN_JSAMPLE == 8 +#define MULTIPLY(var,const) MULTIPLY16C16(var,const) +#else +#define MULTIPLY(var,const) ((var) * (const)) +#endif + + +/* + * Perform the forward DCT on one block of samples. + */ + +GLOBAL(void) +jpeg_fdct_islow (DCTELEM *data) +{ + JLONG tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + JLONG tmp10, tmp11, tmp12, tmp13; + JLONG z1, z2, z3, z4, z5; + DCTELEM *dataptr; + int ctr; + SHIFT_TEMPS + + /* Pass 1: process rows. */ + /* Note results are scaled up by sqrt(8) compared to a true DCT; */ + /* furthermore, we scale the results by 2**PASS1_BITS. */ + + dataptr = data; + for (ctr = DCTSIZE-1; ctr >= 0; ctr--) { + tmp0 = dataptr[0] + dataptr[7]; + tmp7 = dataptr[0] - dataptr[7]; + tmp1 = dataptr[1] + dataptr[6]; + tmp6 = dataptr[1] - dataptr[6]; + tmp2 = dataptr[2] + dataptr[5]; + tmp5 = dataptr[2] - dataptr[5]; + tmp3 = dataptr[3] + dataptr[4]; + tmp4 = dataptr[3] - dataptr[4]; + + /* Even part per LL&M figure 1 --- note that published figure is faulty; + * rotator "sqrt(2)*c1" should be "sqrt(2)*c6". + */ + + tmp10 = tmp0 + tmp3; + tmp13 = tmp0 - tmp3; + tmp11 = tmp1 + tmp2; + tmp12 = tmp1 - tmp2; + + dataptr[0] = (DCTELEM) LEFT_SHIFT(tmp10 + tmp11, PASS1_BITS); + dataptr[4] = (DCTELEM) LEFT_SHIFT(tmp10 - tmp11, PASS1_BITS); + + z1 = MULTIPLY(tmp12 + tmp13, FIX_0_541196100); + dataptr[2] = (DCTELEM) DESCALE(z1 + MULTIPLY(tmp13, FIX_0_765366865), + CONST_BITS-PASS1_BITS); + dataptr[6] = (DCTELEM) DESCALE(z1 + MULTIPLY(tmp12, - FIX_1_847759065), + CONST_BITS-PASS1_BITS); + + /* Odd part per figure 8 --- note paper omits factor of sqrt(2). + * cK represents cos(K*pi/16). + * i0..i3 in the paper are tmp4..tmp7 here. + */ + + z1 = tmp4 + tmp7; + z2 = tmp5 + tmp6; + z3 = tmp4 + tmp6; + z4 = tmp5 + tmp7; + z5 = MULTIPLY(z3 + z4, FIX_1_175875602); /* sqrt(2) * c3 */ + + tmp4 = MULTIPLY(tmp4, FIX_0_298631336); /* sqrt(2) * (-c1+c3+c5-c7) */ + tmp5 = MULTIPLY(tmp5, FIX_2_053119869); /* sqrt(2) * ( c1+c3-c5+c7) */ + tmp6 = MULTIPLY(tmp6, FIX_3_072711026); /* sqrt(2) * ( c1+c3+c5-c7) */ + tmp7 = MULTIPLY(tmp7, FIX_1_501321110); /* sqrt(2) * ( c1+c3-c5-c7) */ + z1 = MULTIPLY(z1, - FIX_0_899976223); /* sqrt(2) * (c7-c3) */ + z2 = MULTIPLY(z2, - FIX_2_562915447); /* sqrt(2) * (-c1-c3) */ + z3 = MULTIPLY(z3, - FIX_1_961570560); /* sqrt(2) * (-c3-c5) */ + z4 = MULTIPLY(z4, - FIX_0_390180644); /* sqrt(2) * (c5-c3) */ + + z3 += z5; + z4 += z5; + + dataptr[7] = (DCTELEM) DESCALE(tmp4 + z1 + z3, CONST_BITS-PASS1_BITS); + dataptr[5] = (DCTELEM) DESCALE(tmp5 + z2 + z4, CONST_BITS-PASS1_BITS); + dataptr[3] = (DCTELEM) DESCALE(tmp6 + z2 + z3, CONST_BITS-PASS1_BITS); + dataptr[1] = (DCTELEM) DESCALE(tmp7 + z1 + z4, CONST_BITS-PASS1_BITS); + + dataptr += DCTSIZE; /* advance pointer to next row */ + } + + /* Pass 2: process columns. + * We remove the PASS1_BITS scaling, but leave the results scaled up + * by an overall factor of 8. + */ + + dataptr = data; + for (ctr = DCTSIZE-1; ctr >= 0; ctr--) { + tmp0 = dataptr[DCTSIZE*0] + dataptr[DCTSIZE*7]; + tmp7 = dataptr[DCTSIZE*0] - dataptr[DCTSIZE*7]; + tmp1 = dataptr[DCTSIZE*1] + dataptr[DCTSIZE*6]; + tmp6 = dataptr[DCTSIZE*1] - dataptr[DCTSIZE*6]; + tmp2 = dataptr[DCTSIZE*2] + dataptr[DCTSIZE*5]; + tmp5 = dataptr[DCTSIZE*2] - dataptr[DCTSIZE*5]; + tmp3 = dataptr[DCTSIZE*3] + dataptr[DCTSIZE*4]; + tmp4 = dataptr[DCTSIZE*3] - dataptr[DCTSIZE*4]; + + /* Even part per LL&M figure 1 --- note that published figure is faulty; + * rotator "sqrt(2)*c1" should be "sqrt(2)*c6". + */ + + tmp10 = tmp0 + tmp3; + tmp13 = tmp0 - tmp3; + tmp11 = tmp1 + tmp2; + tmp12 = tmp1 - tmp2; + + dataptr[DCTSIZE*0] = (DCTELEM) DESCALE(tmp10 + tmp11, PASS1_BITS); + dataptr[DCTSIZE*4] = (DCTELEM) DESCALE(tmp10 - tmp11, PASS1_BITS); + + z1 = MULTIPLY(tmp12 + tmp13, FIX_0_541196100); + dataptr[DCTSIZE*2] = (DCTELEM) DESCALE(z1 + MULTIPLY(tmp13, FIX_0_765366865), + CONST_BITS+PASS1_BITS); + dataptr[DCTSIZE*6] = (DCTELEM) DESCALE(z1 + MULTIPLY(tmp12, - FIX_1_847759065), + CONST_BITS+PASS1_BITS); + + /* Odd part per figure 8 --- note paper omits factor of sqrt(2). + * cK represents cos(K*pi/16). + * i0..i3 in the paper are tmp4..tmp7 here. + */ + + z1 = tmp4 + tmp7; + z2 = tmp5 + tmp6; + z3 = tmp4 + tmp6; + z4 = tmp5 + tmp7; + z5 = MULTIPLY(z3 + z4, FIX_1_175875602); /* sqrt(2) * c3 */ + + tmp4 = MULTIPLY(tmp4, FIX_0_298631336); /* sqrt(2) * (-c1+c3+c5-c7) */ + tmp5 = MULTIPLY(tmp5, FIX_2_053119869); /* sqrt(2) * ( c1+c3-c5+c7) */ + tmp6 = MULTIPLY(tmp6, FIX_3_072711026); /* sqrt(2) * ( c1+c3+c5-c7) */ + tmp7 = MULTIPLY(tmp7, FIX_1_501321110); /* sqrt(2) * ( c1+c3-c5-c7) */ + z1 = MULTIPLY(z1, - FIX_0_899976223); /* sqrt(2) * (c7-c3) */ + z2 = MULTIPLY(z2, - FIX_2_562915447); /* sqrt(2) * (-c1-c3) */ + z3 = MULTIPLY(z3, - FIX_1_961570560); /* sqrt(2) * (-c3-c5) */ + z4 = MULTIPLY(z4, - FIX_0_390180644); /* sqrt(2) * (c5-c3) */ + + z3 += z5; + z4 += z5; + + dataptr[DCTSIZE*7] = (DCTELEM) DESCALE(tmp4 + z1 + z3, + CONST_BITS+PASS1_BITS); + dataptr[DCTSIZE*5] = (DCTELEM) DESCALE(tmp5 + z2 + z4, + CONST_BITS+PASS1_BITS); + dataptr[DCTSIZE*3] = (DCTELEM) DESCALE(tmp6 + z2 + z3, + CONST_BITS+PASS1_BITS); + dataptr[DCTSIZE*1] = (DCTELEM) DESCALE(tmp7 + z1 + z4, + CONST_BITS+PASS1_BITS); + + dataptr++; /* advance pointer to next column */ + } +} + +#endif /* DCT_ISLOW_SUPPORTED */ diff --git a/media/libjpeg/jidctflt.c b/media/libjpeg/jidctflt.c new file mode 100644 index 000000000..68c521ed7 --- /dev/null +++ b/media/libjpeg/jidctflt.c @@ -0,0 +1,240 @@ +/* + * jidctflt.c + * + * This file was part of the Independent JPEG Group's software: + * Copyright (C) 1994-1998, Thomas G. Lane. + * Modified 2010 by Guido Vollbeding. + * libjpeg-turbo Modifications: + * Copyright (C) 2014, D. R. Commander. + * For conditions of distribution and use, see the accompanying README.ijg + * file. + * + * This file contains a floating-point implementation of the + * inverse DCT (Discrete Cosine Transform). In the IJG code, this routine + * must also perform dequantization of the input coefficients. + * + * This implementation should be more accurate than either of the integer + * IDCT implementations. However, it may not give the same results on all + * machines because of differences in roundoff behavior. Speed will depend + * on the hardware's floating point capacity. + * + * A 2-D IDCT can be done by 1-D IDCT on each column followed by 1-D IDCT + * on each row (or vice versa, but it's more convenient to emit a row at + * a time). Direct algorithms are also available, but they are much more + * complex and seem not to be any faster when reduced to code. + * + * This implementation is based on Arai, Agui, and Nakajima's algorithm for + * scaled DCT. Their original paper (Trans. IEICE E-71(11):1095) is in + * Japanese, but the algorithm is described in the Pennebaker & Mitchell + * JPEG textbook (see REFERENCES section in file README.ijg). The following + * code is based directly on figure 4-8 in P&M. + * While an 8-point DCT cannot be done in less than 11 multiplies, it is + * possible to arrange the computation so that many of the multiplies are + * simple scalings of the final outputs. These multiplies can then be + * folded into the multiplications or divisions by the JPEG quantization + * table entries. The AA&N method leaves only 5 multiplies and 29 adds + * to be done in the DCT itself. + * The primary disadvantage of this method is that with a fixed-point + * implementation, accuracy is lost due to imprecise representation of the + * scaled quantization values. However, that problem does not arise if + * we use floating point arithmetic. + */ + +#define JPEG_INTERNALS +#include "jinclude.h" +#include "jpeglib.h" +#include "jdct.h" /* Private declarations for DCT subsystem */ + +#ifdef DCT_FLOAT_SUPPORTED + + +/* + * This module is specialized to the case DCTSIZE = 8. + */ + +#if DCTSIZE != 8 + Sorry, this code only copes with 8x8 DCTs. /* deliberate syntax err */ +#endif + + +/* Dequantize a coefficient by multiplying it by the multiplier-table + * entry; produce a float result. + */ + +#define DEQUANTIZE(coef,quantval) (((FAST_FLOAT) (coef)) * (quantval)) + + +/* + * Perform dequantization and inverse DCT on one block of coefficients. + */ + +GLOBAL(void) +jpeg_idct_float (j_decompress_ptr cinfo, jpeg_component_info *compptr, + JCOEFPTR coef_block, + JSAMPARRAY output_buf, JDIMENSION output_col) +{ + FAST_FLOAT tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + FAST_FLOAT tmp10, tmp11, tmp12, tmp13; + FAST_FLOAT z5, z10, z11, z12, z13; + JCOEFPTR inptr; + FLOAT_MULT_TYPE *quantptr; + FAST_FLOAT *wsptr; + JSAMPROW outptr; + JSAMPLE *range_limit = cinfo->sample_range_limit; + int ctr; + FAST_FLOAT workspace[DCTSIZE2]; /* buffers data between passes */ + #define _0_125 ((FLOAT_MULT_TYPE)0.125) + + /* Pass 1: process columns from input, store into work array. */ + + inptr = coef_block; + quantptr = (FLOAT_MULT_TYPE *) compptr->dct_table; + wsptr = workspace; + for (ctr = DCTSIZE; ctr > 0; ctr--) { + /* Due to quantization, we will usually find that many of the input + * coefficients are zero, especially the AC terms. We can exploit this + * by short-circuiting the IDCT calculation for any column in which all + * the AC terms are zero. In that case each output is equal to the + * DC coefficient (with scale factor as needed). + * With typical images and quantization tables, half or more of the + * column DCT calculations can be simplified this way. + */ + + if (inptr[DCTSIZE*1] == 0 && inptr[DCTSIZE*2] == 0 && + inptr[DCTSIZE*3] == 0 && inptr[DCTSIZE*4] == 0 && + inptr[DCTSIZE*5] == 0 && inptr[DCTSIZE*6] == 0 && + inptr[DCTSIZE*7] == 0) { + /* AC terms all zero */ + FAST_FLOAT dcval = DEQUANTIZE(inptr[DCTSIZE*0], + quantptr[DCTSIZE*0] * _0_125); + + wsptr[DCTSIZE*0] = dcval; + wsptr[DCTSIZE*1] = dcval; + wsptr[DCTSIZE*2] = dcval; + wsptr[DCTSIZE*3] = dcval; + wsptr[DCTSIZE*4] = dcval; + wsptr[DCTSIZE*5] = dcval; + wsptr[DCTSIZE*6] = dcval; + wsptr[DCTSIZE*7] = dcval; + + inptr++; /* advance pointers to next column */ + quantptr++; + wsptr++; + continue; + } + + /* Even part */ + + tmp0 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0] * _0_125); + tmp1 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2] * _0_125); + tmp2 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4] * _0_125); + tmp3 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6] * _0_125); + + tmp10 = tmp0 + tmp2; /* phase 3 */ + tmp11 = tmp0 - tmp2; + + tmp13 = tmp1 + tmp3; /* phases 5-3 */ + tmp12 = (tmp1 - tmp3) * ((FAST_FLOAT) 1.414213562) - tmp13; /* 2*c4 */ + + tmp0 = tmp10 + tmp13; /* phase 2 */ + tmp3 = tmp10 - tmp13; + tmp1 = tmp11 + tmp12; + tmp2 = tmp11 - tmp12; + + /* Odd part */ + + tmp4 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1] * _0_125); + tmp5 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3] * _0_125); + tmp6 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5] * _0_125); + tmp7 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7] * _0_125); + + z13 = tmp6 + tmp5; /* phase 6 */ + z10 = tmp6 - tmp5; + z11 = tmp4 + tmp7; + z12 = tmp4 - tmp7; + + tmp7 = z11 + z13; /* phase 5 */ + tmp11 = (z11 - z13) * ((FAST_FLOAT) 1.414213562); /* 2*c4 */ + + z5 = (z10 + z12) * ((FAST_FLOAT) 1.847759065); /* 2*c2 */ + tmp10 = z5 - z12 * ((FAST_FLOAT) 1.082392200); /* 2*(c2-c6) */ + tmp12 = z5 - z10 * ((FAST_FLOAT) 2.613125930); /* 2*(c2+c6) */ + + tmp6 = tmp12 - tmp7; /* phase 2 */ + tmp5 = tmp11 - tmp6; + tmp4 = tmp10 - tmp5; + + wsptr[DCTSIZE*0] = tmp0 + tmp7; + wsptr[DCTSIZE*7] = tmp0 - tmp7; + wsptr[DCTSIZE*1] = tmp1 + tmp6; + wsptr[DCTSIZE*6] = tmp1 - tmp6; + wsptr[DCTSIZE*2] = tmp2 + tmp5; + wsptr[DCTSIZE*5] = tmp2 - tmp5; + wsptr[DCTSIZE*3] = tmp3 + tmp4; + wsptr[DCTSIZE*4] = tmp3 - tmp4; + + inptr++; /* advance pointers to next column */ + quantptr++; + wsptr++; + } + + /* Pass 2: process rows from work array, store into output array. */ + + wsptr = workspace; + for (ctr = 0; ctr < DCTSIZE; ctr++) { + outptr = output_buf[ctr] + output_col; + /* Rows of zeroes can be exploited in the same way as we did with columns. + * However, the column calculation has created many nonzero AC terms, so + * the simplification applies less often (typically 5% to 10% of the time). + * And testing floats for zero is relatively expensive, so we don't bother. + */ + + /* Even part */ + + /* Apply signed->unsigned and prepare float->int conversion */ + z5 = wsptr[0] + ((FAST_FLOAT) CENTERJSAMPLE + (FAST_FLOAT) 0.5); + tmp10 = z5 + wsptr[4]; + tmp11 = z5 - wsptr[4]; + + tmp13 = wsptr[2] + wsptr[6]; + tmp12 = (wsptr[2] - wsptr[6]) * ((FAST_FLOAT) 1.414213562) - tmp13; + + tmp0 = tmp10 + tmp13; + tmp3 = tmp10 - tmp13; + tmp1 = tmp11 + tmp12; + tmp2 = tmp11 - tmp12; + + /* Odd part */ + + z13 = wsptr[5] + wsptr[3]; + z10 = wsptr[5] - wsptr[3]; + z11 = wsptr[1] + wsptr[7]; + z12 = wsptr[1] - wsptr[7]; + + tmp7 = z11 + z13; + tmp11 = (z11 - z13) * ((FAST_FLOAT) 1.414213562); + + z5 = (z10 + z12) * ((FAST_FLOAT) 1.847759065); /* 2*c2 */ + tmp10 = z5 - z12 * ((FAST_FLOAT) 1.082392200); /* 2*(c2-c6) */ + tmp12 = z5 - z10 * ((FAST_FLOAT) 2.613125930); /* 2*(c2+c6) */ + + tmp6 = tmp12 - tmp7; + tmp5 = tmp11 - tmp6; + tmp4 = tmp10 - tmp5; + + /* Final output stage: float->int conversion and range-limit */ + + outptr[0] = range_limit[((int) (tmp0 + tmp7)) & RANGE_MASK]; + outptr[7] = range_limit[((int) (tmp0 - tmp7)) & RANGE_MASK]; + outptr[1] = range_limit[((int) (tmp1 + tmp6)) & RANGE_MASK]; + outptr[6] = range_limit[((int) (tmp1 - tmp6)) & RANGE_MASK]; + outptr[2] = range_limit[((int) (tmp2 + tmp5)) & RANGE_MASK]; + outptr[5] = range_limit[((int) (tmp2 - tmp5)) & RANGE_MASK]; + outptr[3] = range_limit[((int) (tmp3 + tmp4)) & RANGE_MASK]; + outptr[4] = range_limit[((int) (tmp3 - tmp4)) & RANGE_MASK]; + + wsptr += DCTSIZE; /* advance pointer to next row */ + } +} + +#endif /* DCT_FLOAT_SUPPORTED */ diff --git a/media/libjpeg/jidctfst.c b/media/libjpeg/jidctfst.c new file mode 100644 index 000000000..10db739b8 --- /dev/null +++ b/media/libjpeg/jidctfst.c @@ -0,0 +1,371 @@ +/* + * jidctfst.c + * + * This file was part of the Independent JPEG Group's software: + * Copyright (C) 1994-1998, Thomas G. Lane. + * libjpeg-turbo Modifications: + * Copyright (C) 2015, D. R. Commander. + * For conditions of distribution and use, see the accompanying README.ijg + * file. + * + * This file contains a fast, not so accurate integer implementation of the + * inverse DCT (Discrete Cosine Transform). In the IJG code, this routine + * must also perform dequantization of the input coefficients. + * + * A 2-D IDCT can be done by 1-D IDCT on each column followed by 1-D IDCT + * on each row (or vice versa, but it's more convenient to emit a row at + * a time). Direct algorithms are also available, but they are much more + * complex and seem not to be any faster when reduced to code. + * + * This implementation is based on Arai, Agui, and Nakajima's algorithm for + * scaled DCT. Their original paper (Trans. IEICE E-71(11):1095) is in + * Japanese, but the algorithm is described in the Pennebaker & Mitchell + * JPEG textbook (see REFERENCES section in file README.ijg). The following + * code is based directly on figure 4-8 in P&M. + * While an 8-point DCT cannot be done in less than 11 multiplies, it is + * possible to arrange the computation so that many of the multiplies are + * simple scalings of the final outputs. These multiplies can then be + * folded into the multiplications or divisions by the JPEG quantization + * table entries. The AA&N method leaves only 5 multiplies and 29 adds + * to be done in the DCT itself. + * The primary disadvantage of this method is that with fixed-point math, + * accuracy is lost due to imprecise representation of the scaled + * quantization values. The smaller the quantization table entry, the less + * precise the scaled value, so this implementation does worse with high- + * quality-setting files than with low-quality ones. + */ + +#define JPEG_INTERNALS +#include "jinclude.h" +#include "jpeglib.h" +#include "jdct.h" /* Private declarations for DCT subsystem */ + +#ifdef DCT_IFAST_SUPPORTED + + +/* + * This module is specialized to the case DCTSIZE = 8. + */ + +#if DCTSIZE != 8 + Sorry, this code only copes with 8x8 DCTs. /* deliberate syntax err */ +#endif + + +/* Scaling decisions are generally the same as in the LL&M algorithm; + * see jidctint.c for more details. However, we choose to descale + * (right shift) multiplication products as soon as they are formed, + * rather than carrying additional fractional bits into subsequent additions. + * This compromises accuracy slightly, but it lets us save a few shifts. + * More importantly, 16-bit arithmetic is then adequate (for 8-bit samples) + * everywhere except in the multiplications proper; this saves a good deal + * of work on 16-bit-int machines. + * + * The dequantized coefficients are not integers because the AA&N scaling + * factors have been incorporated. We represent them scaled up by PASS1_BITS, + * so that the first and second IDCT rounds have the same input scaling. + * For 8-bit JSAMPLEs, we choose IFAST_SCALE_BITS = PASS1_BITS so as to + * avoid a descaling shift; this compromises accuracy rather drastically + * for small quantization table entries, but it saves a lot of shifts. + * For 12-bit JSAMPLEs, there's no hope of using 16x16 multiplies anyway, + * so we use a much larger scaling factor to preserve accuracy. + * + * A final compromise is to represent the multiplicative constants to only + * 8 fractional bits, rather than 13. This saves some shifting work on some + * machines, and may also reduce the cost of multiplication (since there + * are fewer one-bits in the constants). + */ + +#if BITS_IN_JSAMPLE == 8 +#define CONST_BITS 8 +#define PASS1_BITS 2 +#else +#define CONST_BITS 8 +#define PASS1_BITS 1 /* lose a little precision to avoid overflow */ +#endif + +/* Some C compilers fail to reduce "FIX(constant)" at compile time, thus + * causing a lot of useless floating-point operations at run time. + * To get around this we use the following pre-calculated constants. + * If you change CONST_BITS you may want to add appropriate values. + * (With a reasonable C compiler, you can just rely on the FIX() macro...) + */ + +#if CONST_BITS == 8 +#define FIX_1_082392200 ((JLONG) 277) /* FIX(1.082392200) */ +#define FIX_1_414213562 ((JLONG) 362) /* FIX(1.414213562) */ +#define FIX_1_847759065 ((JLONG) 473) /* FIX(1.847759065) */ +#define FIX_2_613125930 ((JLONG) 669) /* FIX(2.613125930) */ +#else +#define FIX_1_082392200 FIX(1.082392200) +#define FIX_1_414213562 FIX(1.414213562) +#define FIX_1_847759065 FIX(1.847759065) +#define FIX_2_613125930 FIX(2.613125930) +#endif + + +/* We can gain a little more speed, with a further compromise in accuracy, + * by omitting the addition in a descaling shift. This yields an incorrectly + * rounded result half the time... + */ + +#ifndef USE_ACCURATE_ROUNDING +#undef DESCALE +#define DESCALE(x,n) RIGHT_SHIFT(x, n) +#endif + + +/* Multiply a DCTELEM variable by an JLONG constant, and immediately + * descale to yield a DCTELEM result. + */ + +#define MULTIPLY(var,const) ((DCTELEM) DESCALE((var) * (const), CONST_BITS)) + + +/* Dequantize a coefficient by multiplying it by the multiplier-table + * entry; produce a DCTELEM result. For 8-bit data a 16x16->16 + * multiplication will do. For 12-bit data, the multiplier table is + * declared JLONG, so a 32-bit multiply will be used. + */ + +#if BITS_IN_JSAMPLE == 8 +#define DEQUANTIZE(coef,quantval) (((IFAST_MULT_TYPE) (coef)) * (quantval)) +#else +#define DEQUANTIZE(coef,quantval) \ + DESCALE((coef)*(quantval), IFAST_SCALE_BITS-PASS1_BITS) +#endif + + +/* Like DESCALE, but applies to a DCTELEM and produces an int. + * We assume that int right shift is unsigned if JLONG right shift is. + */ + +#ifdef RIGHT_SHIFT_IS_UNSIGNED +#define ISHIFT_TEMPS DCTELEM ishift_temp; +#if BITS_IN_JSAMPLE == 8 +#define DCTELEMBITS 16 /* DCTELEM may be 16 or 32 bits */ +#else +#define DCTELEMBITS 32 /* DCTELEM must be 32 bits */ +#endif +#define IRIGHT_SHIFT(x,shft) \ + ((ishift_temp = (x)) < 0 ? \ + (ishift_temp >> (shft)) | ((~((DCTELEM) 0)) << (DCTELEMBITS-(shft))) : \ + (ishift_temp >> (shft))) +#else +#define ISHIFT_TEMPS +#define IRIGHT_SHIFT(x,shft) ((x) >> (shft)) +#endif + +#ifdef USE_ACCURATE_ROUNDING +#define IDESCALE(x,n) ((int) IRIGHT_SHIFT((x) + (1 << ((n)-1)), n)) +#else +#define IDESCALE(x,n) ((int) IRIGHT_SHIFT(x, n)) +#endif + + +/* + * Perform dequantization and inverse DCT on one block of coefficients. + */ + +GLOBAL(void) +jpeg_idct_ifast (j_decompress_ptr cinfo, jpeg_component_info *compptr, + JCOEFPTR coef_block, + JSAMPARRAY output_buf, JDIMENSION output_col) +{ + DCTELEM tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + DCTELEM tmp10, tmp11, tmp12, tmp13; + DCTELEM z5, z10, z11, z12, z13; + JCOEFPTR inptr; + IFAST_MULT_TYPE *quantptr; + int *wsptr; + JSAMPROW outptr; + JSAMPLE *range_limit = IDCT_range_limit(cinfo); + int ctr; + int workspace[DCTSIZE2]; /* buffers data between passes */ + SHIFT_TEMPS /* for DESCALE */ + ISHIFT_TEMPS /* for IDESCALE */ + + /* Pass 1: process columns from input, store into work array. */ + + inptr = coef_block; + quantptr = (IFAST_MULT_TYPE *) compptr->dct_table; + wsptr = workspace; + for (ctr = DCTSIZE; ctr > 0; ctr--) { + /* Due to quantization, we will usually find that many of the input + * coefficients are zero, especially the AC terms. We can exploit this + * by short-circuiting the IDCT calculation for any column in which all + * the AC terms are zero. In that case each output is equal to the + * DC coefficient (with scale factor as needed). + * With typical images and quantization tables, half or more of the + * column DCT calculations can be simplified this way. + */ + + if (inptr[DCTSIZE*1] == 0 && inptr[DCTSIZE*2] == 0 && + inptr[DCTSIZE*3] == 0 && inptr[DCTSIZE*4] == 0 && + inptr[DCTSIZE*5] == 0 && inptr[DCTSIZE*6] == 0 && + inptr[DCTSIZE*7] == 0) { + /* AC terms all zero */ + int dcval = (int) DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]); + + wsptr[DCTSIZE*0] = dcval; + wsptr[DCTSIZE*1] = dcval; + wsptr[DCTSIZE*2] = dcval; + wsptr[DCTSIZE*3] = dcval; + wsptr[DCTSIZE*4] = dcval; + wsptr[DCTSIZE*5] = dcval; + wsptr[DCTSIZE*6] = dcval; + wsptr[DCTSIZE*7] = dcval; + + inptr++; /* advance pointers to next column */ + quantptr++; + wsptr++; + continue; + } + + /* Even part */ + + tmp0 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]); + tmp1 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]); + tmp2 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]); + tmp3 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]); + + tmp10 = tmp0 + tmp2; /* phase 3 */ + tmp11 = tmp0 - tmp2; + + tmp13 = tmp1 + tmp3; /* phases 5-3 */ + tmp12 = MULTIPLY(tmp1 - tmp3, FIX_1_414213562) - tmp13; /* 2*c4 */ + + tmp0 = tmp10 + tmp13; /* phase 2 */ + tmp3 = tmp10 - tmp13; + tmp1 = tmp11 + tmp12; + tmp2 = tmp11 - tmp12; + + /* Odd part */ + + tmp4 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]); + tmp5 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]); + tmp6 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]); + tmp7 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]); + + z13 = tmp6 + tmp5; /* phase 6 */ + z10 = tmp6 - tmp5; + z11 = tmp4 + tmp7; + z12 = tmp4 - tmp7; + + tmp7 = z11 + z13; /* phase 5 */ + tmp11 = MULTIPLY(z11 - z13, FIX_1_414213562); /* 2*c4 */ + + z5 = MULTIPLY(z10 + z12, FIX_1_847759065); /* 2*c2 */ + tmp10 = MULTIPLY(z12, FIX_1_082392200) - z5; /* 2*(c2-c6) */ + tmp12 = MULTIPLY(z10, - FIX_2_613125930) + z5; /* -2*(c2+c6) */ + + tmp6 = tmp12 - tmp7; /* phase 2 */ + tmp5 = tmp11 - tmp6; + tmp4 = tmp10 + tmp5; + + wsptr[DCTSIZE*0] = (int) (tmp0 + tmp7); + wsptr[DCTSIZE*7] = (int) (tmp0 - tmp7); + wsptr[DCTSIZE*1] = (int) (tmp1 + tmp6); + wsptr[DCTSIZE*6] = (int) (tmp1 - tmp6); + wsptr[DCTSIZE*2] = (int) (tmp2 + tmp5); + wsptr[DCTSIZE*5] = (int) (tmp2 - tmp5); + wsptr[DCTSIZE*4] = (int) (tmp3 + tmp4); + wsptr[DCTSIZE*3] = (int) (tmp3 - tmp4); + + inptr++; /* advance pointers to next column */ + quantptr++; + wsptr++; + } + + /* Pass 2: process rows from work array, store into output array. */ + /* Note that we must descale the results by a factor of 8 == 2**3, */ + /* and also undo the PASS1_BITS scaling. */ + + wsptr = workspace; + for (ctr = 0; ctr < DCTSIZE; ctr++) { + outptr = output_buf[ctr] + output_col; + /* Rows of zeroes can be exploited in the same way as we did with columns. + * However, the column calculation has created many nonzero AC terms, so + * the simplification applies less often (typically 5% to 10% of the time). + * On machines with very fast multiplication, it's possible that the + * test takes more time than it's worth. In that case this section + * may be commented out. + */ + +#ifndef NO_ZERO_ROW_TEST + if (wsptr[1] == 0 && wsptr[2] == 0 && wsptr[3] == 0 && wsptr[4] == 0 && + wsptr[5] == 0 && wsptr[6] == 0 && wsptr[7] == 0) { + /* AC terms all zero */ + JSAMPLE dcval = range_limit[IDESCALE(wsptr[0], PASS1_BITS+3) + & RANGE_MASK]; + + outptr[0] = dcval; + outptr[1] = dcval; + outptr[2] = dcval; + outptr[3] = dcval; + outptr[4] = dcval; + outptr[5] = dcval; + outptr[6] = dcval; + outptr[7] = dcval; + + wsptr += DCTSIZE; /* advance pointer to next row */ + continue; + } +#endif + + /* Even part */ + + tmp10 = ((DCTELEM) wsptr[0] + (DCTELEM) wsptr[4]); + tmp11 = ((DCTELEM) wsptr[0] - (DCTELEM) wsptr[4]); + + tmp13 = ((DCTELEM) wsptr[2] + (DCTELEM) wsptr[6]); + tmp12 = MULTIPLY((DCTELEM) wsptr[2] - (DCTELEM) wsptr[6], FIX_1_414213562) + - tmp13; + + tmp0 = tmp10 + tmp13; + tmp3 = tmp10 - tmp13; + tmp1 = tmp11 + tmp12; + tmp2 = tmp11 - tmp12; + + /* Odd part */ + + z13 = (DCTELEM) wsptr[5] + (DCTELEM) wsptr[3]; + z10 = (DCTELEM) wsptr[5] - (DCTELEM) wsptr[3]; + z11 = (DCTELEM) wsptr[1] + (DCTELEM) wsptr[7]; + z12 = (DCTELEM) wsptr[1] - (DCTELEM) wsptr[7]; + + tmp7 = z11 + z13; /* phase 5 */ + tmp11 = MULTIPLY(z11 - z13, FIX_1_414213562); /* 2*c4 */ + + z5 = MULTIPLY(z10 + z12, FIX_1_847759065); /* 2*c2 */ + tmp10 = MULTIPLY(z12, FIX_1_082392200) - z5; /* 2*(c2-c6) */ + tmp12 = MULTIPLY(z10, - FIX_2_613125930) + z5; /* -2*(c2+c6) */ + + tmp6 = tmp12 - tmp7; /* phase 2 */ + tmp5 = tmp11 - tmp6; + tmp4 = tmp10 + tmp5; + + /* Final output stage: scale down by a factor of 8 and range-limit */ + + outptr[0] = range_limit[IDESCALE(tmp0 + tmp7, PASS1_BITS+3) + & RANGE_MASK]; + outptr[7] = range_limit[IDESCALE(tmp0 - tmp7, PASS1_BITS+3) + & RANGE_MASK]; + outptr[1] = range_limit[IDESCALE(tmp1 + tmp6, PASS1_BITS+3) + & RANGE_MASK]; + outptr[6] = range_limit[IDESCALE(tmp1 - tmp6, PASS1_BITS+3) + & RANGE_MASK]; + outptr[2] = range_limit[IDESCALE(tmp2 + tmp5, PASS1_BITS+3) + & RANGE_MASK]; + outptr[5] = range_limit[IDESCALE(tmp2 - tmp5, PASS1_BITS+3) + & RANGE_MASK]; + outptr[4] = range_limit[IDESCALE(tmp3 + tmp4, PASS1_BITS+3) + & RANGE_MASK]; + outptr[3] = range_limit[IDESCALE(tmp3 - tmp4, PASS1_BITS+3) + & RANGE_MASK]; + + wsptr += DCTSIZE; /* advance pointer to next row */ + } +} + +#endif /* DCT_IFAST_SUPPORTED */ diff --git a/media/libjpeg/jidctint.c b/media/libjpeg/jidctint.c new file mode 100644 index 000000000..3ac6caf69 --- /dev/null +++ b/media/libjpeg/jidctint.c @@ -0,0 +1,2627 @@ +/* + * jidctint.c + * + * This file was part of the Independent JPEG Group's software. + * Copyright (C) 1991-1998, Thomas G. Lane. + * Modification developed 2002-2009 by Guido Vollbeding. + * libjpeg-turbo Modifications: + * Copyright (C) 2015, D. R. Commander. + * For conditions of distribution and use, see the accompanying README.ijg + * file. + * + * This file contains a slow-but-accurate integer implementation of the + * inverse DCT (Discrete Cosine Transform). In the IJG code, this routine + * must also perform dequantization of the input coefficients. + * + * A 2-D IDCT can be done by 1-D IDCT on each column followed by 1-D IDCT + * on each row (or vice versa, but it's more convenient to emit a row at + * a time). Direct algorithms are also available, but they are much more + * complex and seem not to be any faster when reduced to code. + * + * This implementation is based on an algorithm described in + * C. Loeffler, A. Ligtenberg and G. Moschytz, "Practical Fast 1-D DCT + * Algorithms with 11 Multiplications", Proc. Int'l. Conf. on Acoustics, + * Speech, and Signal Processing 1989 (ICASSP '89), pp. 988-991. + * The primary algorithm described there uses 11 multiplies and 29 adds. + * We use their alternate method with 12 multiplies and 32 adds. + * The advantage of this method is that no data path contains more than one + * multiplication; this allows a very simple and accurate implementation in + * scaled fixed-point arithmetic, with a minimal number of shifts. + * + * We also provide IDCT routines with various output sample block sizes for + * direct resolution reduction or enlargement without additional resampling: + * NxN (N=1...16) pixels for one 8x8 input DCT block. + * + * For N<8 we simply take the corresponding low-frequency coefficients of + * the 8x8 input DCT block and apply an NxN point IDCT on the sub-block + * to yield the downscaled outputs. + * This can be seen as direct low-pass downsampling from the DCT domain + * point of view rather than the usual spatial domain point of view, + * yielding significant computational savings and results at least + * as good as common bilinear (averaging) spatial downsampling. + * + * For N>8 we apply a partial NxN IDCT on the 8 input coefficients as + * lower frequencies and higher frequencies assumed to be zero. + * It turns out that the computational effort is similar to the 8x8 IDCT + * regarding the output size. + * Furthermore, the scaling and descaling is the same for all IDCT sizes. + * + * CAUTION: We rely on the FIX() macro except for the N=1,2,4,8 cases + * since there would be too many additional constants to pre-calculate. + */ + +#define JPEG_INTERNALS +#include "jinclude.h" +#include "jpeglib.h" +#include "jdct.h" /* Private declarations for DCT subsystem */ + +#ifdef DCT_ISLOW_SUPPORTED + + +/* + * This module is specialized to the case DCTSIZE = 8. + */ + +#if DCTSIZE != 8 + Sorry, this code only copes with 8x8 DCT blocks. /* deliberate syntax err */ +#endif + + +/* + * The poop on this scaling stuff is as follows: + * + * Each 1-D IDCT step produces outputs which are a factor of sqrt(N) + * larger than the true IDCT outputs. The final outputs are therefore + * a factor of N larger than desired; since N=8 this can be cured by + * a simple right shift at the end of the algorithm. The advantage of + * this arrangement is that we save two multiplications per 1-D IDCT, + * because the y0 and y4 inputs need not be divided by sqrt(N). + * + * We have to do addition and subtraction of the integer inputs, which + * is no problem, and multiplication by fractional constants, which is + * a problem to do in integer arithmetic. We multiply all the constants + * by CONST_SCALE and convert them to integer constants (thus retaining + * CONST_BITS bits of precision in the constants). After doing a + * multiplication we have to divide the product by CONST_SCALE, with proper + * rounding, to produce the correct output. This division can be done + * cheaply as a right shift of CONST_BITS bits. We postpone shifting + * as long as possible so that partial sums can be added together with + * full fractional precision. + * + * The outputs of the first pass are scaled up by PASS1_BITS bits so that + * they are represented to better-than-integral precision. These outputs + * require BITS_IN_JSAMPLE + PASS1_BITS + 3 bits; this fits in a 16-bit word + * with the recommended scaling. (To scale up 12-bit sample data further, an + * intermediate JLONG array would be needed.) + * + * To avoid overflow of the 32-bit intermediate results in pass 2, we must + * have BITS_IN_JSAMPLE + CONST_BITS + PASS1_BITS <= 26. Error analysis + * shows that the values given below are the most effective. + */ + +#if BITS_IN_JSAMPLE == 8 +#define CONST_BITS 13 +#define PASS1_BITS 2 +#else +#define CONST_BITS 13 +#define PASS1_BITS 1 /* lose a little precision to avoid overflow */ +#endif + +/* Some C compilers fail to reduce "FIX(constant)" at compile time, thus + * causing a lot of useless floating-point operations at run time. + * To get around this we use the following pre-calculated constants. + * If you change CONST_BITS you may want to add appropriate values. + * (With a reasonable C compiler, you can just rely on the FIX() macro...) + */ + +#if CONST_BITS == 13 +#define FIX_0_298631336 ((JLONG) 2446) /* FIX(0.298631336) */ +#define FIX_0_390180644 ((JLONG) 3196) /* FIX(0.390180644) */ +#define FIX_0_541196100 ((JLONG) 4433) /* FIX(0.541196100) */ +#define FIX_0_765366865 ((JLONG) 6270) /* FIX(0.765366865) */ +#define FIX_0_899976223 ((JLONG) 7373) /* FIX(0.899976223) */ +#define FIX_1_175875602 ((JLONG) 9633) /* FIX(1.175875602) */ +#define FIX_1_501321110 ((JLONG) 12299) /* FIX(1.501321110) */ +#define FIX_1_847759065 ((JLONG) 15137) /* FIX(1.847759065) */ +#define FIX_1_961570560 ((JLONG) 16069) /* FIX(1.961570560) */ +#define FIX_2_053119869 ((JLONG) 16819) /* FIX(2.053119869) */ +#define FIX_2_562915447 ((JLONG) 20995) /* FIX(2.562915447) */ +#define FIX_3_072711026 ((JLONG) 25172) /* FIX(3.072711026) */ +#else +#define FIX_0_298631336 FIX(0.298631336) +#define FIX_0_390180644 FIX(0.390180644) +#define FIX_0_541196100 FIX(0.541196100) +#define FIX_0_765366865 FIX(0.765366865) +#define FIX_0_899976223 FIX(0.899976223) +#define FIX_1_175875602 FIX(1.175875602) +#define FIX_1_501321110 FIX(1.501321110) +#define FIX_1_847759065 FIX(1.847759065) +#define FIX_1_961570560 FIX(1.961570560) +#define FIX_2_053119869 FIX(2.053119869) +#define FIX_2_562915447 FIX(2.562915447) +#define FIX_3_072711026 FIX(3.072711026) +#endif + + +/* Multiply an JLONG variable by an JLONG constant to yield an JLONG result. + * For 8-bit samples with the recommended scaling, all the variable + * and constant values involved are no more than 16 bits wide, so a + * 16x16->32 bit multiply can be used instead of a full 32x32 multiply. + * For 12-bit samples, a full 32-bit multiplication will be needed. + */ + +#if BITS_IN_JSAMPLE == 8 +#define MULTIPLY(var,const) MULTIPLY16C16(var,const) +#else +#define MULTIPLY(var,const) ((var) * (const)) +#endif + + +/* Dequantize a coefficient by multiplying it by the multiplier-table + * entry; produce an int result. In this module, both inputs and result + * are 16 bits or less, so either int or short multiply will work. + */ + +#define DEQUANTIZE(coef,quantval) (((ISLOW_MULT_TYPE) (coef)) * (quantval)) + + +/* + * Perform dequantization and inverse DCT on one block of coefficients. + */ + +GLOBAL(void) +jpeg_idct_islow (j_decompress_ptr cinfo, jpeg_component_info *compptr, + JCOEFPTR coef_block, + JSAMPARRAY output_buf, JDIMENSION output_col) +{ + JLONG tmp0, tmp1, tmp2, tmp3; + JLONG tmp10, tmp11, tmp12, tmp13; + JLONG z1, z2, z3, z4, z5; + JCOEFPTR inptr; + ISLOW_MULT_TYPE *quantptr; + int *wsptr; + JSAMPROW outptr; + JSAMPLE *range_limit = IDCT_range_limit(cinfo); + int ctr; + int workspace[DCTSIZE2]; /* buffers data between passes */ + SHIFT_TEMPS + + /* Pass 1: process columns from input, store into work array. */ + /* Note results are scaled up by sqrt(8) compared to a true IDCT; */ + /* furthermore, we scale the results by 2**PASS1_BITS. */ + + inptr = coef_block; + quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table; + wsptr = workspace; + for (ctr = DCTSIZE; ctr > 0; ctr--) { + /* Due to quantization, we will usually find that many of the input + * coefficients are zero, especially the AC terms. We can exploit this + * by short-circuiting the IDCT calculation for any column in which all + * the AC terms are zero. In that case each output is equal to the + * DC coefficient (with scale factor as needed). + * With typical images and quantization tables, half or more of the + * column DCT calculations can be simplified this way. + */ + + if (inptr[DCTSIZE*1] == 0 && inptr[DCTSIZE*2] == 0 && + inptr[DCTSIZE*3] == 0 && inptr[DCTSIZE*4] == 0 && + inptr[DCTSIZE*5] == 0 && inptr[DCTSIZE*6] == 0 && + inptr[DCTSIZE*7] == 0) { + /* AC terms all zero */ + int dcval = LEFT_SHIFT(DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]), + PASS1_BITS); + + wsptr[DCTSIZE*0] = dcval; + wsptr[DCTSIZE*1] = dcval; + wsptr[DCTSIZE*2] = dcval; + wsptr[DCTSIZE*3] = dcval; + wsptr[DCTSIZE*4] = dcval; + wsptr[DCTSIZE*5] = dcval; + wsptr[DCTSIZE*6] = dcval; + wsptr[DCTSIZE*7] = dcval; + + inptr++; /* advance pointers to next column */ + quantptr++; + wsptr++; + continue; + } + + /* Even part: reverse the even part of the forward DCT. */ + /* The rotator is sqrt(2)*c(-6). */ + + z2 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]); + z3 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]); + + z1 = MULTIPLY(z2 + z3, FIX_0_541196100); + tmp2 = z1 + MULTIPLY(z3, - FIX_1_847759065); + tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865); + + z2 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]); + z3 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]); + + tmp0 = LEFT_SHIFT(z2 + z3, CONST_BITS); + tmp1 = LEFT_SHIFT(z2 - z3, CONST_BITS); + + tmp10 = tmp0 + tmp3; + tmp13 = tmp0 - tmp3; + tmp11 = tmp1 + tmp2; + tmp12 = tmp1 - tmp2; + + /* Odd part per figure 8; the matrix is unitary and hence its + * transpose is its inverse. i0..i3 are y7,y5,y3,y1 respectively. + */ + + tmp0 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]); + tmp1 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]); + tmp2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]); + tmp3 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]); + + z1 = tmp0 + tmp3; + z2 = tmp1 + tmp2; + z3 = tmp0 + tmp2; + z4 = tmp1 + tmp3; + z5 = MULTIPLY(z3 + z4, FIX_1_175875602); /* sqrt(2) * c3 */ + + tmp0 = MULTIPLY(tmp0, FIX_0_298631336); /* sqrt(2) * (-c1+c3+c5-c7) */ + tmp1 = MULTIPLY(tmp1, FIX_2_053119869); /* sqrt(2) * ( c1+c3-c5+c7) */ + tmp2 = MULTIPLY(tmp2, FIX_3_072711026); /* sqrt(2) * ( c1+c3+c5-c7) */ + tmp3 = MULTIPLY(tmp3, FIX_1_501321110); /* sqrt(2) * ( c1+c3-c5-c7) */ + z1 = MULTIPLY(z1, - FIX_0_899976223); /* sqrt(2) * (c7-c3) */ + z2 = MULTIPLY(z2, - FIX_2_562915447); /* sqrt(2) * (-c1-c3) */ + z3 = MULTIPLY(z3, - FIX_1_961570560); /* sqrt(2) * (-c3-c5) */ + z4 = MULTIPLY(z4, - FIX_0_390180644); /* sqrt(2) * (c5-c3) */ + + z3 += z5; + z4 += z5; + + tmp0 += z1 + z3; + tmp1 += z2 + z4; + tmp2 += z2 + z3; + tmp3 += z1 + z4; + + /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */ + + wsptr[DCTSIZE*0] = (int) DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS); + wsptr[DCTSIZE*7] = (int) DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS); + wsptr[DCTSIZE*1] = (int) DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS); + wsptr[DCTSIZE*6] = (int) DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS); + wsptr[DCTSIZE*2] = (int) DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS); + wsptr[DCTSIZE*5] = (int) DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS); + wsptr[DCTSIZE*3] = (int) DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS); + wsptr[DCTSIZE*4] = (int) DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS); + + inptr++; /* advance pointers to next column */ + quantptr++; + wsptr++; + } + + /* Pass 2: process rows from work array, store into output array. */ + /* Note that we must descale the results by a factor of 8 == 2**3, */ + /* and also undo the PASS1_BITS scaling. */ + + wsptr = workspace; + for (ctr = 0; ctr < DCTSIZE; ctr++) { + outptr = output_buf[ctr] + output_col; + /* Rows of zeroes can be exploited in the same way as we did with columns. + * However, the column calculation has created many nonzero AC terms, so + * the simplification applies less often (typically 5% to 10% of the time). + * On machines with very fast multiplication, it's possible that the + * test takes more time than it's worth. In that case this section + * may be commented out. + */ + +#ifndef NO_ZERO_ROW_TEST + if (wsptr[1] == 0 && wsptr[2] == 0 && wsptr[3] == 0 && wsptr[4] == 0 && + wsptr[5] == 0 && wsptr[6] == 0 && wsptr[7] == 0) { + /* AC terms all zero */ + JSAMPLE dcval = range_limit[(int) DESCALE((JLONG) wsptr[0], PASS1_BITS+3) + & RANGE_MASK]; + + outptr[0] = dcval; + outptr[1] = dcval; + outptr[2] = dcval; + outptr[3] = dcval; + outptr[4] = dcval; + outptr[5] = dcval; + outptr[6] = dcval; + outptr[7] = dcval; + + wsptr += DCTSIZE; /* advance pointer to next row */ + continue; + } +#endif + + /* Even part: reverse the even part of the forward DCT. */ + /* The rotator is sqrt(2)*c(-6). */ + + z2 = (JLONG) wsptr[2]; + z3 = (JLONG) wsptr[6]; + + z1 = MULTIPLY(z2 + z3, FIX_0_541196100); + tmp2 = z1 + MULTIPLY(z3, - FIX_1_847759065); + tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865); + + tmp0 = LEFT_SHIFT((JLONG) wsptr[0] + (JLONG) wsptr[4], CONST_BITS); + tmp1 = LEFT_SHIFT((JLONG) wsptr[0] - (JLONG) wsptr[4], CONST_BITS); + + tmp10 = tmp0 + tmp3; + tmp13 = tmp0 - tmp3; + tmp11 = tmp1 + tmp2; + tmp12 = tmp1 - tmp2; + + /* Odd part per figure 8; the matrix is unitary and hence its + * transpose is its inverse. i0..i3 are y7,y5,y3,y1 respectively. + */ + + tmp0 = (JLONG) wsptr[7]; + tmp1 = (JLONG) wsptr[5]; + tmp2 = (JLONG) wsptr[3]; + tmp3 = (JLONG) wsptr[1]; + + z1 = tmp0 + tmp3; + z2 = tmp1 + tmp2; + z3 = tmp0 + tmp2; + z4 = tmp1 + tmp3; + z5 = MULTIPLY(z3 + z4, FIX_1_175875602); /* sqrt(2) * c3 */ + + tmp0 = MULTIPLY(tmp0, FIX_0_298631336); /* sqrt(2) * (-c1+c3+c5-c7) */ + tmp1 = MULTIPLY(tmp1, FIX_2_053119869); /* sqrt(2) * ( c1+c3-c5+c7) */ + tmp2 = MULTIPLY(tmp2, FIX_3_072711026); /* sqrt(2) * ( c1+c3+c5-c7) */ + tmp3 = MULTIPLY(tmp3, FIX_1_501321110); /* sqrt(2) * ( c1+c3-c5-c7) */ + z1 = MULTIPLY(z1, - FIX_0_899976223); /* sqrt(2) * (c7-c3) */ + z2 = MULTIPLY(z2, - FIX_2_562915447); /* sqrt(2) * (-c1-c3) */ + z3 = MULTIPLY(z3, - FIX_1_961570560); /* sqrt(2) * (-c3-c5) */ + z4 = MULTIPLY(z4, - FIX_0_390180644); /* sqrt(2) * (c5-c3) */ + + z3 += z5; + z4 += z5; + + tmp0 += z1 + z3; + tmp1 += z2 + z4; + tmp2 += z2 + z3; + tmp3 += z1 + z4; + + /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */ + + outptr[0] = range_limit[(int) DESCALE(tmp10 + tmp3, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[7] = range_limit[(int) DESCALE(tmp10 - tmp3, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[1] = range_limit[(int) DESCALE(tmp11 + tmp2, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[6] = range_limit[(int) DESCALE(tmp11 - tmp2, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[2] = range_limit[(int) DESCALE(tmp12 + tmp1, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[5] = range_limit[(int) DESCALE(tmp12 - tmp1, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[3] = range_limit[(int) DESCALE(tmp13 + tmp0, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[4] = range_limit[(int) DESCALE(tmp13 - tmp0, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + + wsptr += DCTSIZE; /* advance pointer to next row */ + } +} + +#ifdef IDCT_SCALING_SUPPORTED + + +/* + * Perform dequantization and inverse DCT on one block of coefficients, + * producing a 7x7 output block. + * + * Optimized algorithm with 12 multiplications in the 1-D kernel. + * cK represents sqrt(2) * cos(K*pi/14). + */ + +GLOBAL(void) +jpeg_idct_7x7 (j_decompress_ptr cinfo, jpeg_component_info *compptr, + JCOEFPTR coef_block, + JSAMPARRAY output_buf, JDIMENSION output_col) +{ + JLONG tmp0, tmp1, tmp2, tmp10, tmp11, tmp12, tmp13; + JLONG z1, z2, z3; + JCOEFPTR inptr; + ISLOW_MULT_TYPE *quantptr; + int *wsptr; + JSAMPROW outptr; + JSAMPLE *range_limit = IDCT_range_limit(cinfo); + int ctr; + int workspace[7*7]; /* buffers data between passes */ + SHIFT_TEMPS + + /* Pass 1: process columns from input, store into work array. */ + + inptr = coef_block; + quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table; + wsptr = workspace; + for (ctr = 0; ctr < 7; ctr++, inptr++, quantptr++, wsptr++) { + /* Even part */ + + tmp13 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]); + tmp13 = LEFT_SHIFT(tmp13, CONST_BITS); + /* Add fudge factor here for final descale. */ + tmp13 += ONE << (CONST_BITS-PASS1_BITS-1); + + z1 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]); + z2 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]); + z3 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]); + + tmp10 = MULTIPLY(z2 - z3, FIX(0.881747734)); /* c4 */ + tmp12 = MULTIPLY(z1 - z2, FIX(0.314692123)); /* c6 */ + tmp11 = tmp10 + tmp12 + tmp13 - MULTIPLY(z2, FIX(1.841218003)); /* c2+c4-c6 */ + tmp0 = z1 + z3; + z2 -= tmp0; + tmp0 = MULTIPLY(tmp0, FIX(1.274162392)) + tmp13; /* c2 */ + tmp10 += tmp0 - MULTIPLY(z3, FIX(0.077722536)); /* c2-c4-c6 */ + tmp12 += tmp0 - MULTIPLY(z1, FIX(2.470602249)); /* c2+c4+c6 */ + tmp13 += MULTIPLY(z2, FIX(1.414213562)); /* c0 */ + + /* Odd part */ + + z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]); + z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]); + z3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]); + + tmp1 = MULTIPLY(z1 + z2, FIX(0.935414347)); /* (c3+c1-c5)/2 */ + tmp2 = MULTIPLY(z1 - z2, FIX(0.170262339)); /* (c3+c5-c1)/2 */ + tmp0 = tmp1 - tmp2; + tmp1 += tmp2; + tmp2 = MULTIPLY(z2 + z3, - FIX(1.378756276)); /* -c1 */ + tmp1 += tmp2; + z2 = MULTIPLY(z1 + z3, FIX(0.613604268)); /* c5 */ + tmp0 += z2; + tmp2 += z2 + MULTIPLY(z3, FIX(1.870828693)); /* c3+c1-c5 */ + + /* Final output stage */ + + wsptr[7*0] = (int) RIGHT_SHIFT(tmp10 + tmp0, CONST_BITS-PASS1_BITS); + wsptr[7*6] = (int) RIGHT_SHIFT(tmp10 - tmp0, CONST_BITS-PASS1_BITS); + wsptr[7*1] = (int) RIGHT_SHIFT(tmp11 + tmp1, CONST_BITS-PASS1_BITS); + wsptr[7*5] = (int) RIGHT_SHIFT(tmp11 - tmp1, CONST_BITS-PASS1_BITS); + wsptr[7*2] = (int) RIGHT_SHIFT(tmp12 + tmp2, CONST_BITS-PASS1_BITS); + wsptr[7*4] = (int) RIGHT_SHIFT(tmp12 - tmp2, CONST_BITS-PASS1_BITS); + wsptr[7*3] = (int) RIGHT_SHIFT(tmp13, CONST_BITS-PASS1_BITS); + } + + /* Pass 2: process 7 rows from work array, store into output array. */ + + wsptr = workspace; + for (ctr = 0; ctr < 7; ctr++) { + outptr = output_buf[ctr] + output_col; + + /* Even part */ + + /* Add fudge factor here for final descale. */ + tmp13 = (JLONG) wsptr[0] + (ONE << (PASS1_BITS+2)); + tmp13 = LEFT_SHIFT(tmp13, CONST_BITS); + + z1 = (JLONG) wsptr[2]; + z2 = (JLONG) wsptr[4]; + z3 = (JLONG) wsptr[6]; + + tmp10 = MULTIPLY(z2 - z3, FIX(0.881747734)); /* c4 */ + tmp12 = MULTIPLY(z1 - z2, FIX(0.314692123)); /* c6 */ + tmp11 = tmp10 + tmp12 + tmp13 - MULTIPLY(z2, FIX(1.841218003)); /* c2+c4-c6 */ + tmp0 = z1 + z3; + z2 -= tmp0; + tmp0 = MULTIPLY(tmp0, FIX(1.274162392)) + tmp13; /* c2 */ + tmp10 += tmp0 - MULTIPLY(z3, FIX(0.077722536)); /* c2-c4-c6 */ + tmp12 += tmp0 - MULTIPLY(z1, FIX(2.470602249)); /* c2+c4+c6 */ + tmp13 += MULTIPLY(z2, FIX(1.414213562)); /* c0 */ + + /* Odd part */ + + z1 = (JLONG) wsptr[1]; + z2 = (JLONG) wsptr[3]; + z3 = (JLONG) wsptr[5]; + + tmp1 = MULTIPLY(z1 + z2, FIX(0.935414347)); /* (c3+c1-c5)/2 */ + tmp2 = MULTIPLY(z1 - z2, FIX(0.170262339)); /* (c3+c5-c1)/2 */ + tmp0 = tmp1 - tmp2; + tmp1 += tmp2; + tmp2 = MULTIPLY(z2 + z3, - FIX(1.378756276)); /* -c1 */ + tmp1 += tmp2; + z2 = MULTIPLY(z1 + z3, FIX(0.613604268)); /* c5 */ + tmp0 += z2; + tmp2 += z2 + MULTIPLY(z3, FIX(1.870828693)); /* c3+c1-c5 */ + + /* Final output stage */ + + outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp10 + tmp0, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[6] = range_limit[(int) RIGHT_SHIFT(tmp10 - tmp0, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp11 + tmp1, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[5] = range_limit[(int) RIGHT_SHIFT(tmp11 - tmp1, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp12 + tmp2, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[4] = range_limit[(int) RIGHT_SHIFT(tmp12 - tmp2, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[3] = range_limit[(int) RIGHT_SHIFT(tmp13, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + + wsptr += 7; /* advance pointer to next row */ + } +} + + +/* + * Perform dequantization and inverse DCT on one block of coefficients, + * producing a reduced-size 6x6 output block. + * + * Optimized algorithm with 3 multiplications in the 1-D kernel. + * cK represents sqrt(2) * cos(K*pi/12). + */ + +GLOBAL(void) +jpeg_idct_6x6 (j_decompress_ptr cinfo, jpeg_component_info *compptr, + JCOEFPTR coef_block, + JSAMPARRAY output_buf, JDIMENSION output_col) +{ + JLONG tmp0, tmp1, tmp2, tmp10, tmp11, tmp12; + JLONG z1, z2, z3; + JCOEFPTR inptr; + ISLOW_MULT_TYPE *quantptr; + int *wsptr; + JSAMPROW outptr; + JSAMPLE *range_limit = IDCT_range_limit(cinfo); + int ctr; + int workspace[6*6]; /* buffers data between passes */ + SHIFT_TEMPS + + /* Pass 1: process columns from input, store into work array. */ + + inptr = coef_block; + quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table; + wsptr = workspace; + for (ctr = 0; ctr < 6; ctr++, inptr++, quantptr++, wsptr++) { + /* Even part */ + + tmp0 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]); + tmp0 = LEFT_SHIFT(tmp0, CONST_BITS); + /* Add fudge factor here for final descale. */ + tmp0 += ONE << (CONST_BITS-PASS1_BITS-1); + tmp2 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]); + tmp10 = MULTIPLY(tmp2, FIX(0.707106781)); /* c4 */ + tmp1 = tmp0 + tmp10; + tmp11 = RIGHT_SHIFT(tmp0 - tmp10 - tmp10, CONST_BITS-PASS1_BITS); + tmp10 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]); + tmp0 = MULTIPLY(tmp10, FIX(1.224744871)); /* c2 */ + tmp10 = tmp1 + tmp0; + tmp12 = tmp1 - tmp0; + + /* Odd part */ + + z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]); + z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]); + z3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]); + tmp1 = MULTIPLY(z1 + z3, FIX(0.366025404)); /* c5 */ + tmp0 = tmp1 + LEFT_SHIFT(z1 + z2, CONST_BITS); + tmp2 = tmp1 + LEFT_SHIFT(z3 - z2, CONST_BITS); + tmp1 = LEFT_SHIFT(z1 - z2 - z3, PASS1_BITS); + + /* Final output stage */ + + wsptr[6*0] = (int) RIGHT_SHIFT(tmp10 + tmp0, CONST_BITS-PASS1_BITS); + wsptr[6*5] = (int) RIGHT_SHIFT(tmp10 - tmp0, CONST_BITS-PASS1_BITS); + wsptr[6*1] = (int) (tmp11 + tmp1); + wsptr[6*4] = (int) (tmp11 - tmp1); + wsptr[6*2] = (int) RIGHT_SHIFT(tmp12 + tmp2, CONST_BITS-PASS1_BITS); + wsptr[6*3] = (int) RIGHT_SHIFT(tmp12 - tmp2, CONST_BITS-PASS1_BITS); + } + + /* Pass 2: process 6 rows from work array, store into output array. */ + + wsptr = workspace; + for (ctr = 0; ctr < 6; ctr++) { + outptr = output_buf[ctr] + output_col; + + /* Even part */ + + /* Add fudge factor here for final descale. */ + tmp0 = (JLONG) wsptr[0] + (ONE << (PASS1_BITS+2)); + tmp0 = LEFT_SHIFT(tmp0, CONST_BITS); + tmp2 = (JLONG) wsptr[4]; + tmp10 = MULTIPLY(tmp2, FIX(0.707106781)); /* c4 */ + tmp1 = tmp0 + tmp10; + tmp11 = tmp0 - tmp10 - tmp10; + tmp10 = (JLONG) wsptr[2]; + tmp0 = MULTIPLY(tmp10, FIX(1.224744871)); /* c2 */ + tmp10 = tmp1 + tmp0; + tmp12 = tmp1 - tmp0; + + /* Odd part */ + + z1 = (JLONG) wsptr[1]; + z2 = (JLONG) wsptr[3]; + z3 = (JLONG) wsptr[5]; + tmp1 = MULTIPLY(z1 + z3, FIX(0.366025404)); /* c5 */ + tmp0 = tmp1 + LEFT_SHIFT(z1 + z2, CONST_BITS); + tmp2 = tmp1 + LEFT_SHIFT(z3 - z2, CONST_BITS); + tmp1 = LEFT_SHIFT(z1 - z2 - z3, CONST_BITS); + + /* Final output stage */ + + outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp10 + tmp0, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[5] = range_limit[(int) RIGHT_SHIFT(tmp10 - tmp0, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp11 + tmp1, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[4] = range_limit[(int) RIGHT_SHIFT(tmp11 - tmp1, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp12 + tmp2, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[3] = range_limit[(int) RIGHT_SHIFT(tmp12 - tmp2, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + + wsptr += 6; /* advance pointer to next row */ + } +} + + +/* + * Perform dequantization and inverse DCT on one block of coefficients, + * producing a reduced-size 5x5 output block. + * + * Optimized algorithm with 5 multiplications in the 1-D kernel. + * cK represents sqrt(2) * cos(K*pi/10). + */ + +GLOBAL(void) +jpeg_idct_5x5 (j_decompress_ptr cinfo, jpeg_component_info *compptr, + JCOEFPTR coef_block, + JSAMPARRAY output_buf, JDIMENSION output_col) +{ + JLONG tmp0, tmp1, tmp10, tmp11, tmp12; + JLONG z1, z2, z3; + JCOEFPTR inptr; + ISLOW_MULT_TYPE *quantptr; + int *wsptr; + JSAMPROW outptr; + JSAMPLE *range_limit = IDCT_range_limit(cinfo); + int ctr; + int workspace[5*5]; /* buffers data between passes */ + SHIFT_TEMPS + + /* Pass 1: process columns from input, store into work array. */ + + inptr = coef_block; + quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table; + wsptr = workspace; + for (ctr = 0; ctr < 5; ctr++, inptr++, quantptr++, wsptr++) { + /* Even part */ + + tmp12 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]); + tmp12 = LEFT_SHIFT(tmp12, CONST_BITS); + /* Add fudge factor here for final descale. */ + tmp12 += ONE << (CONST_BITS-PASS1_BITS-1); + tmp0 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]); + tmp1 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]); + z1 = MULTIPLY(tmp0 + tmp1, FIX(0.790569415)); /* (c2+c4)/2 */ + z2 = MULTIPLY(tmp0 - tmp1, FIX(0.353553391)); /* (c2-c4)/2 */ + z3 = tmp12 + z2; + tmp10 = z3 + z1; + tmp11 = z3 - z1; + tmp12 -= LEFT_SHIFT(z2, 2); + + /* Odd part */ + + z2 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]); + z3 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]); + + z1 = MULTIPLY(z2 + z3, FIX(0.831253876)); /* c3 */ + tmp0 = z1 + MULTIPLY(z2, FIX(0.513743148)); /* c1-c3 */ + tmp1 = z1 - MULTIPLY(z3, FIX(2.176250899)); /* c1+c3 */ + + /* Final output stage */ + + wsptr[5*0] = (int) RIGHT_SHIFT(tmp10 + tmp0, CONST_BITS-PASS1_BITS); + wsptr[5*4] = (int) RIGHT_SHIFT(tmp10 - tmp0, CONST_BITS-PASS1_BITS); + wsptr[5*1] = (int) RIGHT_SHIFT(tmp11 + tmp1, CONST_BITS-PASS1_BITS); + wsptr[5*3] = (int) RIGHT_SHIFT(tmp11 - tmp1, CONST_BITS-PASS1_BITS); + wsptr[5*2] = (int) RIGHT_SHIFT(tmp12, CONST_BITS-PASS1_BITS); + } + + /* Pass 2: process 5 rows from work array, store into output array. */ + + wsptr = workspace; + for (ctr = 0; ctr < 5; ctr++) { + outptr = output_buf[ctr] + output_col; + + /* Even part */ + + /* Add fudge factor here for final descale. */ + tmp12 = (JLONG) wsptr[0] + (ONE << (PASS1_BITS+2)); + tmp12 = LEFT_SHIFT(tmp12, CONST_BITS); + tmp0 = (JLONG) wsptr[2]; + tmp1 = (JLONG) wsptr[4]; + z1 = MULTIPLY(tmp0 + tmp1, FIX(0.790569415)); /* (c2+c4)/2 */ + z2 = MULTIPLY(tmp0 - tmp1, FIX(0.353553391)); /* (c2-c4)/2 */ + z3 = tmp12 + z2; + tmp10 = z3 + z1; + tmp11 = z3 - z1; + tmp12 -= LEFT_SHIFT(z2, 2); + + /* Odd part */ + + z2 = (JLONG) wsptr[1]; + z3 = (JLONG) wsptr[3]; + + z1 = MULTIPLY(z2 + z3, FIX(0.831253876)); /* c3 */ + tmp0 = z1 + MULTIPLY(z2, FIX(0.513743148)); /* c1-c3 */ + tmp1 = z1 - MULTIPLY(z3, FIX(2.176250899)); /* c1+c3 */ + + /* Final output stage */ + + outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp10 + tmp0, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[4] = range_limit[(int) RIGHT_SHIFT(tmp10 - tmp0, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp11 + tmp1, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[3] = range_limit[(int) RIGHT_SHIFT(tmp11 - tmp1, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp12, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + + wsptr += 5; /* advance pointer to next row */ + } +} + + +/* + * Perform dequantization and inverse DCT on one block of coefficients, + * producing a reduced-size 3x3 output block. + * + * Optimized algorithm with 2 multiplications in the 1-D kernel. + * cK represents sqrt(2) * cos(K*pi/6). + */ + +GLOBAL(void) +jpeg_idct_3x3 (j_decompress_ptr cinfo, jpeg_component_info *compptr, + JCOEFPTR coef_block, + JSAMPARRAY output_buf, JDIMENSION output_col) +{ + JLONG tmp0, tmp2, tmp10, tmp12; + JCOEFPTR inptr; + ISLOW_MULT_TYPE *quantptr; + int *wsptr; + JSAMPROW outptr; + JSAMPLE *range_limit = IDCT_range_limit(cinfo); + int ctr; + int workspace[3*3]; /* buffers data between passes */ + SHIFT_TEMPS + + /* Pass 1: process columns from input, store into work array. */ + + inptr = coef_block; + quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table; + wsptr = workspace; + for (ctr = 0; ctr < 3; ctr++, inptr++, quantptr++, wsptr++) { + /* Even part */ + + tmp0 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]); + tmp0 = LEFT_SHIFT(tmp0, CONST_BITS); + /* Add fudge factor here for final descale. */ + tmp0 += ONE << (CONST_BITS-PASS1_BITS-1); + tmp2 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]); + tmp12 = MULTIPLY(tmp2, FIX(0.707106781)); /* c2 */ + tmp10 = tmp0 + tmp12; + tmp2 = tmp0 - tmp12 - tmp12; + + /* Odd part */ + + tmp12 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]); + tmp0 = MULTIPLY(tmp12, FIX(1.224744871)); /* c1 */ + + /* Final output stage */ + + wsptr[3*0] = (int) RIGHT_SHIFT(tmp10 + tmp0, CONST_BITS-PASS1_BITS); + wsptr[3*2] = (int) RIGHT_SHIFT(tmp10 - tmp0, CONST_BITS-PASS1_BITS); + wsptr[3*1] = (int) RIGHT_SHIFT(tmp2, CONST_BITS-PASS1_BITS); + } + + /* Pass 2: process 3 rows from work array, store into output array. */ + + wsptr = workspace; + for (ctr = 0; ctr < 3; ctr++) { + outptr = output_buf[ctr] + output_col; + + /* Even part */ + + /* Add fudge factor here for final descale. */ + tmp0 = (JLONG) wsptr[0] + (ONE << (PASS1_BITS+2)); + tmp0 = LEFT_SHIFT(tmp0, CONST_BITS); + tmp2 = (JLONG) wsptr[2]; + tmp12 = MULTIPLY(tmp2, FIX(0.707106781)); /* c2 */ + tmp10 = tmp0 + tmp12; + tmp2 = tmp0 - tmp12 - tmp12; + + /* Odd part */ + + tmp12 = (JLONG) wsptr[1]; + tmp0 = MULTIPLY(tmp12, FIX(1.224744871)); /* c1 */ + + /* Final output stage */ + + outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp10 + tmp0, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp10 - tmp0, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp2, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + + wsptr += 3; /* advance pointer to next row */ + } +} + + +/* + * Perform dequantization and inverse DCT on one block of coefficients, + * producing a 9x9 output block. + * + * Optimized algorithm with 10 multiplications in the 1-D kernel. + * cK represents sqrt(2) * cos(K*pi/18). + */ + +GLOBAL(void) +jpeg_idct_9x9 (j_decompress_ptr cinfo, jpeg_component_info *compptr, + JCOEFPTR coef_block, + JSAMPARRAY output_buf, JDIMENSION output_col) +{ + JLONG tmp0, tmp1, tmp2, tmp3, tmp10, tmp11, tmp12, tmp13, tmp14; + JLONG z1, z2, z3, z4; + JCOEFPTR inptr; + ISLOW_MULT_TYPE *quantptr; + int *wsptr; + JSAMPROW outptr; + JSAMPLE *range_limit = IDCT_range_limit(cinfo); + int ctr; + int workspace[8*9]; /* buffers data between passes */ + SHIFT_TEMPS + + /* Pass 1: process columns from input, store into work array. */ + + inptr = coef_block; + quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table; + wsptr = workspace; + for (ctr = 0; ctr < 8; ctr++, inptr++, quantptr++, wsptr++) { + /* Even part */ + + tmp0 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]); + tmp0 = LEFT_SHIFT(tmp0, CONST_BITS); + /* Add fudge factor here for final descale. */ + tmp0 += ONE << (CONST_BITS-PASS1_BITS-1); + + z1 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]); + z2 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]); + z3 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]); + + tmp3 = MULTIPLY(z3, FIX(0.707106781)); /* c6 */ + tmp1 = tmp0 + tmp3; + tmp2 = tmp0 - tmp3 - tmp3; + + tmp0 = MULTIPLY(z1 - z2, FIX(0.707106781)); /* c6 */ + tmp11 = tmp2 + tmp0; + tmp14 = tmp2 - tmp0 - tmp0; + + tmp0 = MULTIPLY(z1 + z2, FIX(1.328926049)); /* c2 */ + tmp2 = MULTIPLY(z1, FIX(1.083350441)); /* c4 */ + tmp3 = MULTIPLY(z2, FIX(0.245575608)); /* c8 */ + + tmp10 = tmp1 + tmp0 - tmp3; + tmp12 = tmp1 - tmp0 + tmp2; + tmp13 = tmp1 - tmp2 + tmp3; + + /* Odd part */ + + z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]); + z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]); + z3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]); + z4 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]); + + z2 = MULTIPLY(z2, - FIX(1.224744871)); /* -c3 */ + + tmp2 = MULTIPLY(z1 + z3, FIX(0.909038955)); /* c5 */ + tmp3 = MULTIPLY(z1 + z4, FIX(0.483689525)); /* c7 */ + tmp0 = tmp2 + tmp3 - z2; + tmp1 = MULTIPLY(z3 - z4, FIX(1.392728481)); /* c1 */ + tmp2 += z2 - tmp1; + tmp3 += z2 + tmp1; + tmp1 = MULTIPLY(z1 - z3 - z4, FIX(1.224744871)); /* c3 */ + + /* Final output stage */ + + wsptr[8*0] = (int) RIGHT_SHIFT(tmp10 + tmp0, CONST_BITS-PASS1_BITS); + wsptr[8*8] = (int) RIGHT_SHIFT(tmp10 - tmp0, CONST_BITS-PASS1_BITS); + wsptr[8*1] = (int) RIGHT_SHIFT(tmp11 + tmp1, CONST_BITS-PASS1_BITS); + wsptr[8*7] = (int) RIGHT_SHIFT(tmp11 - tmp1, CONST_BITS-PASS1_BITS); + wsptr[8*2] = (int) RIGHT_SHIFT(tmp12 + tmp2, CONST_BITS-PASS1_BITS); + wsptr[8*6] = (int) RIGHT_SHIFT(tmp12 - tmp2, CONST_BITS-PASS1_BITS); + wsptr[8*3] = (int) RIGHT_SHIFT(tmp13 + tmp3, CONST_BITS-PASS1_BITS); + wsptr[8*5] = (int) RIGHT_SHIFT(tmp13 - tmp3, CONST_BITS-PASS1_BITS); + wsptr[8*4] = (int) RIGHT_SHIFT(tmp14, CONST_BITS-PASS1_BITS); + } + + /* Pass 2: process 9 rows from work array, store into output array. */ + + wsptr = workspace; + for (ctr = 0; ctr < 9; ctr++) { + outptr = output_buf[ctr] + output_col; + + /* Even part */ + + /* Add fudge factor here for final descale. */ + tmp0 = (JLONG) wsptr[0] + (ONE << (PASS1_BITS+2)); + tmp0 = LEFT_SHIFT(tmp0, CONST_BITS); + + z1 = (JLONG) wsptr[2]; + z2 = (JLONG) wsptr[4]; + z3 = (JLONG) wsptr[6]; + + tmp3 = MULTIPLY(z3, FIX(0.707106781)); /* c6 */ + tmp1 = tmp0 + tmp3; + tmp2 = tmp0 - tmp3 - tmp3; + + tmp0 = MULTIPLY(z1 - z2, FIX(0.707106781)); /* c6 */ + tmp11 = tmp2 + tmp0; + tmp14 = tmp2 - tmp0 - tmp0; + + tmp0 = MULTIPLY(z1 + z2, FIX(1.328926049)); /* c2 */ + tmp2 = MULTIPLY(z1, FIX(1.083350441)); /* c4 */ + tmp3 = MULTIPLY(z2, FIX(0.245575608)); /* c8 */ + + tmp10 = tmp1 + tmp0 - tmp3; + tmp12 = tmp1 - tmp0 + tmp2; + tmp13 = tmp1 - tmp2 + tmp3; + + /* Odd part */ + + z1 = (JLONG) wsptr[1]; + z2 = (JLONG) wsptr[3]; + z3 = (JLONG) wsptr[5]; + z4 = (JLONG) wsptr[7]; + + z2 = MULTIPLY(z2, - FIX(1.224744871)); /* -c3 */ + + tmp2 = MULTIPLY(z1 + z3, FIX(0.909038955)); /* c5 */ + tmp3 = MULTIPLY(z1 + z4, FIX(0.483689525)); /* c7 */ + tmp0 = tmp2 + tmp3 - z2; + tmp1 = MULTIPLY(z3 - z4, FIX(1.392728481)); /* c1 */ + tmp2 += z2 - tmp1; + tmp3 += z2 + tmp1; + tmp1 = MULTIPLY(z1 - z3 - z4, FIX(1.224744871)); /* c3 */ + + /* Final output stage */ + + outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp10 + tmp0, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[8] = range_limit[(int) RIGHT_SHIFT(tmp10 - tmp0, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp11 + tmp1, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[7] = range_limit[(int) RIGHT_SHIFT(tmp11 - tmp1, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp12 + tmp2, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[6] = range_limit[(int) RIGHT_SHIFT(tmp12 - tmp2, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[3] = range_limit[(int) RIGHT_SHIFT(tmp13 + tmp3, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[5] = range_limit[(int) RIGHT_SHIFT(tmp13 - tmp3, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[4] = range_limit[(int) RIGHT_SHIFT(tmp14, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + + wsptr += 8; /* advance pointer to next row */ + } +} + + +/* + * Perform dequantization and inverse DCT on one block of coefficients, + * producing a 10x10 output block. + * + * Optimized algorithm with 12 multiplications in the 1-D kernel. + * cK represents sqrt(2) * cos(K*pi/20). + */ + +GLOBAL(void) +jpeg_idct_10x10 (j_decompress_ptr cinfo, jpeg_component_info *compptr, + JCOEFPTR coef_block, + JSAMPARRAY output_buf, JDIMENSION output_col) +{ + JLONG tmp10, tmp11, tmp12, tmp13, tmp14; + JLONG tmp20, tmp21, tmp22, tmp23, tmp24; + JLONG z1, z2, z3, z4, z5; + JCOEFPTR inptr; + ISLOW_MULT_TYPE *quantptr; + int *wsptr; + JSAMPROW outptr; + JSAMPLE *range_limit = IDCT_range_limit(cinfo); + int ctr; + int workspace[8*10]; /* buffers data between passes */ + SHIFT_TEMPS + + /* Pass 1: process columns from input, store into work array. */ + + inptr = coef_block; + quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table; + wsptr = workspace; + for (ctr = 0; ctr < 8; ctr++, inptr++, quantptr++, wsptr++) { + /* Even part */ + + z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]); + z3 = LEFT_SHIFT(z3, CONST_BITS); + /* Add fudge factor here for final descale. */ + z3 += ONE << (CONST_BITS-PASS1_BITS-1); + z4 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]); + z1 = MULTIPLY(z4, FIX(1.144122806)); /* c4 */ + z2 = MULTIPLY(z4, FIX(0.437016024)); /* c8 */ + tmp10 = z3 + z1; + tmp11 = z3 - z2; + + tmp22 = RIGHT_SHIFT(z3 - LEFT_SHIFT(z1 - z2, 1), + CONST_BITS-PASS1_BITS); /* c0 = (c4-c8)*2 */ + + z2 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]); + z3 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]); + + z1 = MULTIPLY(z2 + z3, FIX(0.831253876)); /* c6 */ + tmp12 = z1 + MULTIPLY(z2, FIX(0.513743148)); /* c2-c6 */ + tmp13 = z1 - MULTIPLY(z3, FIX(2.176250899)); /* c2+c6 */ + + tmp20 = tmp10 + tmp12; + tmp24 = tmp10 - tmp12; + tmp21 = tmp11 + tmp13; + tmp23 = tmp11 - tmp13; + + /* Odd part */ + + z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]); + z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]); + z3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]); + z4 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]); + + tmp11 = z2 + z4; + tmp13 = z2 - z4; + + tmp12 = MULTIPLY(tmp13, FIX(0.309016994)); /* (c3-c7)/2 */ + z5 = LEFT_SHIFT(z3, CONST_BITS); + + z2 = MULTIPLY(tmp11, FIX(0.951056516)); /* (c3+c7)/2 */ + z4 = z5 + tmp12; + + tmp10 = MULTIPLY(z1, FIX(1.396802247)) + z2 + z4; /* c1 */ + tmp14 = MULTIPLY(z1, FIX(0.221231742)) - z2 + z4; /* c9 */ + + z2 = MULTIPLY(tmp11, FIX(0.587785252)); /* (c1-c9)/2 */ + z4 = z5 - tmp12 - LEFT_SHIFT(tmp13, CONST_BITS - 1); + + tmp12 = LEFT_SHIFT(z1 - tmp13 - z3, PASS1_BITS); + + tmp11 = MULTIPLY(z1, FIX(1.260073511)) - z2 - z4; /* c3 */ + tmp13 = MULTIPLY(z1, FIX(0.642039522)) - z2 + z4; /* c7 */ + + /* Final output stage */ + + wsptr[8*0] = (int) RIGHT_SHIFT(tmp20 + tmp10, CONST_BITS-PASS1_BITS); + wsptr[8*9] = (int) RIGHT_SHIFT(tmp20 - tmp10, CONST_BITS-PASS1_BITS); + wsptr[8*1] = (int) RIGHT_SHIFT(tmp21 + tmp11, CONST_BITS-PASS1_BITS); + wsptr[8*8] = (int) RIGHT_SHIFT(tmp21 - tmp11, CONST_BITS-PASS1_BITS); + wsptr[8*2] = (int) (tmp22 + tmp12); + wsptr[8*7] = (int) (tmp22 - tmp12); + wsptr[8*3] = (int) RIGHT_SHIFT(tmp23 + tmp13, CONST_BITS-PASS1_BITS); + wsptr[8*6] = (int) RIGHT_SHIFT(tmp23 - tmp13, CONST_BITS-PASS1_BITS); + wsptr[8*4] = (int) RIGHT_SHIFT(tmp24 + tmp14, CONST_BITS-PASS1_BITS); + wsptr[8*5] = (int) RIGHT_SHIFT(tmp24 - tmp14, CONST_BITS-PASS1_BITS); + } + + /* Pass 2: process 10 rows from work array, store into output array. */ + + wsptr = workspace; + for (ctr = 0; ctr < 10; ctr++) { + outptr = output_buf[ctr] + output_col; + + /* Even part */ + + /* Add fudge factor here for final descale. */ + z3 = (JLONG) wsptr[0] + (ONE << (PASS1_BITS+2)); + z3 = LEFT_SHIFT(z3, CONST_BITS); + z4 = (JLONG) wsptr[4]; + z1 = MULTIPLY(z4, FIX(1.144122806)); /* c4 */ + z2 = MULTIPLY(z4, FIX(0.437016024)); /* c8 */ + tmp10 = z3 + z1; + tmp11 = z3 - z2; + + tmp22 = z3 - LEFT_SHIFT(z1 - z2, 1); /* c0 = (c4-c8)*2 */ + + z2 = (JLONG) wsptr[2]; + z3 = (JLONG) wsptr[6]; + + z1 = MULTIPLY(z2 + z3, FIX(0.831253876)); /* c6 */ + tmp12 = z1 + MULTIPLY(z2, FIX(0.513743148)); /* c2-c6 */ + tmp13 = z1 - MULTIPLY(z3, FIX(2.176250899)); /* c2+c6 */ + + tmp20 = tmp10 + tmp12; + tmp24 = tmp10 - tmp12; + tmp21 = tmp11 + tmp13; + tmp23 = tmp11 - tmp13; + + /* Odd part */ + + z1 = (JLONG) wsptr[1]; + z2 = (JLONG) wsptr[3]; + z3 = (JLONG) wsptr[5]; + z3 = LEFT_SHIFT(z3, CONST_BITS); + z4 = (JLONG) wsptr[7]; + + tmp11 = z2 + z4; + tmp13 = z2 - z4; + + tmp12 = MULTIPLY(tmp13, FIX(0.309016994)); /* (c3-c7)/2 */ + + z2 = MULTIPLY(tmp11, FIX(0.951056516)); /* (c3+c7)/2 */ + z4 = z3 + tmp12; + + tmp10 = MULTIPLY(z1, FIX(1.396802247)) + z2 + z4; /* c1 */ + tmp14 = MULTIPLY(z1, FIX(0.221231742)) - z2 + z4; /* c9 */ + + z2 = MULTIPLY(tmp11, FIX(0.587785252)); /* (c1-c9)/2 */ + z4 = z3 - tmp12 - LEFT_SHIFT(tmp13, CONST_BITS - 1); + + tmp12 = LEFT_SHIFT(z1 - tmp13, CONST_BITS) - z3; + + tmp11 = MULTIPLY(z1, FIX(1.260073511)) - z2 - z4; /* c3 */ + tmp13 = MULTIPLY(z1, FIX(0.642039522)) - z2 + z4; /* c7 */ + + /* Final output stage */ + + outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp20 + tmp10, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[9] = range_limit[(int) RIGHT_SHIFT(tmp20 - tmp10, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp21 + tmp11, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[8] = range_limit[(int) RIGHT_SHIFT(tmp21 - tmp11, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp22 + tmp12, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[7] = range_limit[(int) RIGHT_SHIFT(tmp22 - tmp12, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[3] = range_limit[(int) RIGHT_SHIFT(tmp23 + tmp13, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[6] = range_limit[(int) RIGHT_SHIFT(tmp23 - tmp13, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[4] = range_limit[(int) RIGHT_SHIFT(tmp24 + tmp14, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[5] = range_limit[(int) RIGHT_SHIFT(tmp24 - tmp14, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + + wsptr += 8; /* advance pointer to next row */ + } +} + + +/* + * Perform dequantization and inverse DCT on one block of coefficients, + * producing a 11x11 output block. + * + * Optimized algorithm with 24 multiplications in the 1-D kernel. + * cK represents sqrt(2) * cos(K*pi/22). + */ + +GLOBAL(void) +jpeg_idct_11x11 (j_decompress_ptr cinfo, jpeg_component_info *compptr, + JCOEFPTR coef_block, + JSAMPARRAY output_buf, JDIMENSION output_col) +{ + JLONG tmp10, tmp11, tmp12, tmp13, tmp14; + JLONG tmp20, tmp21, tmp22, tmp23, tmp24, tmp25; + JLONG z1, z2, z3, z4; + JCOEFPTR inptr; + ISLOW_MULT_TYPE *quantptr; + int *wsptr; + JSAMPROW outptr; + JSAMPLE *range_limit = IDCT_range_limit(cinfo); + int ctr; + int workspace[8*11]; /* buffers data between passes */ + SHIFT_TEMPS + + /* Pass 1: process columns from input, store into work array. */ + + inptr = coef_block; + quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table; + wsptr = workspace; + for (ctr = 0; ctr < 8; ctr++, inptr++, quantptr++, wsptr++) { + /* Even part */ + + tmp10 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]); + tmp10 = LEFT_SHIFT(tmp10, CONST_BITS); + /* Add fudge factor here for final descale. */ + tmp10 += ONE << (CONST_BITS-PASS1_BITS-1); + + z1 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]); + z2 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]); + z3 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]); + + tmp20 = MULTIPLY(z2 - z3, FIX(2.546640132)); /* c2+c4 */ + tmp23 = MULTIPLY(z2 - z1, FIX(0.430815045)); /* c2-c6 */ + z4 = z1 + z3; + tmp24 = MULTIPLY(z4, - FIX(1.155664402)); /* -(c2-c10) */ + z4 -= z2; + tmp25 = tmp10 + MULTIPLY(z4, FIX(1.356927976)); /* c2 */ + tmp21 = tmp20 + tmp23 + tmp25 - + MULTIPLY(z2, FIX(1.821790775)); /* c2+c4+c10-c6 */ + tmp20 += tmp25 + MULTIPLY(z3, FIX(2.115825087)); /* c4+c6 */ + tmp23 += tmp25 - MULTIPLY(z1, FIX(1.513598477)); /* c6+c8 */ + tmp24 += tmp25; + tmp22 = tmp24 - MULTIPLY(z3, FIX(0.788749120)); /* c8+c10 */ + tmp24 += MULTIPLY(z2, FIX(1.944413522)) - /* c2+c8 */ + MULTIPLY(z1, FIX(1.390975730)); /* c4+c10 */ + tmp25 = tmp10 - MULTIPLY(z4, FIX(1.414213562)); /* c0 */ + + /* Odd part */ + + z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]); + z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]); + z3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]); + z4 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]); + + tmp11 = z1 + z2; + tmp14 = MULTIPLY(tmp11 + z3 + z4, FIX(0.398430003)); /* c9 */ + tmp11 = MULTIPLY(tmp11, FIX(0.887983902)); /* c3-c9 */ + tmp12 = MULTIPLY(z1 + z3, FIX(0.670361295)); /* c5-c9 */ + tmp13 = tmp14 + MULTIPLY(z1 + z4, FIX(0.366151574)); /* c7-c9 */ + tmp10 = tmp11 + tmp12 + tmp13 - + MULTIPLY(z1, FIX(0.923107866)); /* c7+c5+c3-c1-2*c9 */ + z1 = tmp14 - MULTIPLY(z2 + z3, FIX(1.163011579)); /* c7+c9 */ + tmp11 += z1 + MULTIPLY(z2, FIX(2.073276588)); /* c1+c7+3*c9-c3 */ + tmp12 += z1 - MULTIPLY(z3, FIX(1.192193623)); /* c3+c5-c7-c9 */ + z1 = MULTIPLY(z2 + z4, - FIX(1.798248910)); /* -(c1+c9) */ + tmp11 += z1; + tmp13 += z1 + MULTIPLY(z4, FIX(2.102458632)); /* c1+c5+c9-c7 */ + tmp14 += MULTIPLY(z2, - FIX(1.467221301)) + /* -(c5+c9) */ + MULTIPLY(z3, FIX(1.001388905)) - /* c1-c9 */ + MULTIPLY(z4, FIX(1.684843907)); /* c3+c9 */ + + /* Final output stage */ + + wsptr[8*0] = (int) RIGHT_SHIFT(tmp20 + tmp10, CONST_BITS-PASS1_BITS); + wsptr[8*10] = (int) RIGHT_SHIFT(tmp20 - tmp10, CONST_BITS-PASS1_BITS); + wsptr[8*1] = (int) RIGHT_SHIFT(tmp21 + tmp11, CONST_BITS-PASS1_BITS); + wsptr[8*9] = (int) RIGHT_SHIFT(tmp21 - tmp11, CONST_BITS-PASS1_BITS); + wsptr[8*2] = (int) RIGHT_SHIFT(tmp22 + tmp12, CONST_BITS-PASS1_BITS); + wsptr[8*8] = (int) RIGHT_SHIFT(tmp22 - tmp12, CONST_BITS-PASS1_BITS); + wsptr[8*3] = (int) RIGHT_SHIFT(tmp23 + tmp13, CONST_BITS-PASS1_BITS); + wsptr[8*7] = (int) RIGHT_SHIFT(tmp23 - tmp13, CONST_BITS-PASS1_BITS); + wsptr[8*4] = (int) RIGHT_SHIFT(tmp24 + tmp14, CONST_BITS-PASS1_BITS); + wsptr[8*6] = (int) RIGHT_SHIFT(tmp24 - tmp14, CONST_BITS-PASS1_BITS); + wsptr[8*5] = (int) RIGHT_SHIFT(tmp25, CONST_BITS-PASS1_BITS); + } + + /* Pass 2: process 11 rows from work array, store into output array. */ + + wsptr = workspace; + for (ctr = 0; ctr < 11; ctr++) { + outptr = output_buf[ctr] + output_col; + + /* Even part */ + + /* Add fudge factor here for final descale. */ + tmp10 = (JLONG) wsptr[0] + (ONE << (PASS1_BITS+2)); + tmp10 = LEFT_SHIFT(tmp10, CONST_BITS); + + z1 = (JLONG) wsptr[2]; + z2 = (JLONG) wsptr[4]; + z3 = (JLONG) wsptr[6]; + + tmp20 = MULTIPLY(z2 - z3, FIX(2.546640132)); /* c2+c4 */ + tmp23 = MULTIPLY(z2 - z1, FIX(0.430815045)); /* c2-c6 */ + z4 = z1 + z3; + tmp24 = MULTIPLY(z4, - FIX(1.155664402)); /* -(c2-c10) */ + z4 -= z2; + tmp25 = tmp10 + MULTIPLY(z4, FIX(1.356927976)); /* c2 */ + tmp21 = tmp20 + tmp23 + tmp25 - + MULTIPLY(z2, FIX(1.821790775)); /* c2+c4+c10-c6 */ + tmp20 += tmp25 + MULTIPLY(z3, FIX(2.115825087)); /* c4+c6 */ + tmp23 += tmp25 - MULTIPLY(z1, FIX(1.513598477)); /* c6+c8 */ + tmp24 += tmp25; + tmp22 = tmp24 - MULTIPLY(z3, FIX(0.788749120)); /* c8+c10 */ + tmp24 += MULTIPLY(z2, FIX(1.944413522)) - /* c2+c8 */ + MULTIPLY(z1, FIX(1.390975730)); /* c4+c10 */ + tmp25 = tmp10 - MULTIPLY(z4, FIX(1.414213562)); /* c0 */ + + /* Odd part */ + + z1 = (JLONG) wsptr[1]; + z2 = (JLONG) wsptr[3]; + z3 = (JLONG) wsptr[5]; + z4 = (JLONG) wsptr[7]; + + tmp11 = z1 + z2; + tmp14 = MULTIPLY(tmp11 + z3 + z4, FIX(0.398430003)); /* c9 */ + tmp11 = MULTIPLY(tmp11, FIX(0.887983902)); /* c3-c9 */ + tmp12 = MULTIPLY(z1 + z3, FIX(0.670361295)); /* c5-c9 */ + tmp13 = tmp14 + MULTIPLY(z1 + z4, FIX(0.366151574)); /* c7-c9 */ + tmp10 = tmp11 + tmp12 + tmp13 - + MULTIPLY(z1, FIX(0.923107866)); /* c7+c5+c3-c1-2*c9 */ + z1 = tmp14 - MULTIPLY(z2 + z3, FIX(1.163011579)); /* c7+c9 */ + tmp11 += z1 + MULTIPLY(z2, FIX(2.073276588)); /* c1+c7+3*c9-c3 */ + tmp12 += z1 - MULTIPLY(z3, FIX(1.192193623)); /* c3+c5-c7-c9 */ + z1 = MULTIPLY(z2 + z4, - FIX(1.798248910)); /* -(c1+c9) */ + tmp11 += z1; + tmp13 += z1 + MULTIPLY(z4, FIX(2.102458632)); /* c1+c5+c9-c7 */ + tmp14 += MULTIPLY(z2, - FIX(1.467221301)) + /* -(c5+c9) */ + MULTIPLY(z3, FIX(1.001388905)) - /* c1-c9 */ + MULTIPLY(z4, FIX(1.684843907)); /* c3+c9 */ + + /* Final output stage */ + + outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp20 + tmp10, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[10] = range_limit[(int) RIGHT_SHIFT(tmp20 - tmp10, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp21 + tmp11, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[9] = range_limit[(int) RIGHT_SHIFT(tmp21 - tmp11, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp22 + tmp12, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[8] = range_limit[(int) RIGHT_SHIFT(tmp22 - tmp12, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[3] = range_limit[(int) RIGHT_SHIFT(tmp23 + tmp13, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[7] = range_limit[(int) RIGHT_SHIFT(tmp23 - tmp13, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[4] = range_limit[(int) RIGHT_SHIFT(tmp24 + tmp14, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[6] = range_limit[(int) RIGHT_SHIFT(tmp24 - tmp14, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[5] = range_limit[(int) RIGHT_SHIFT(tmp25, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + + wsptr += 8; /* advance pointer to next row */ + } +} + + +/* + * Perform dequantization and inverse DCT on one block of coefficients, + * producing a 12x12 output block. + * + * Optimized algorithm with 15 multiplications in the 1-D kernel. + * cK represents sqrt(2) * cos(K*pi/24). + */ + +GLOBAL(void) +jpeg_idct_12x12 (j_decompress_ptr cinfo, jpeg_component_info *compptr, + JCOEFPTR coef_block, + JSAMPARRAY output_buf, JDIMENSION output_col) +{ + JLONG tmp10, tmp11, tmp12, tmp13, tmp14, tmp15; + JLONG tmp20, tmp21, tmp22, tmp23, tmp24, tmp25; + JLONG z1, z2, z3, z4; + JCOEFPTR inptr; + ISLOW_MULT_TYPE *quantptr; + int *wsptr; + JSAMPROW outptr; + JSAMPLE *range_limit = IDCT_range_limit(cinfo); + int ctr; + int workspace[8*12]; /* buffers data between passes */ + SHIFT_TEMPS + + /* Pass 1: process columns from input, store into work array. */ + + inptr = coef_block; + quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table; + wsptr = workspace; + for (ctr = 0; ctr < 8; ctr++, inptr++, quantptr++, wsptr++) { + /* Even part */ + + z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]); + z3 = LEFT_SHIFT(z3, CONST_BITS); + /* Add fudge factor here for final descale. */ + z3 += ONE << (CONST_BITS-PASS1_BITS-1); + + z4 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]); + z4 = MULTIPLY(z4, FIX(1.224744871)); /* c4 */ + + tmp10 = z3 + z4; + tmp11 = z3 - z4; + + z1 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]); + z4 = MULTIPLY(z1, FIX(1.366025404)); /* c2 */ + z1 = LEFT_SHIFT(z1, CONST_BITS); + z2 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]); + z2 = LEFT_SHIFT(z2, CONST_BITS); + + tmp12 = z1 - z2; + + tmp21 = z3 + tmp12; + tmp24 = z3 - tmp12; + + tmp12 = z4 + z2; + + tmp20 = tmp10 + tmp12; + tmp25 = tmp10 - tmp12; + + tmp12 = z4 - z1 - z2; + + tmp22 = tmp11 + tmp12; + tmp23 = tmp11 - tmp12; + + /* Odd part */ + + z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]); + z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]); + z3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]); + z4 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]); + + tmp11 = MULTIPLY(z2, FIX(1.306562965)); /* c3 */ + tmp14 = MULTIPLY(z2, - FIX_0_541196100); /* -c9 */ + + tmp10 = z1 + z3; + tmp15 = MULTIPLY(tmp10 + z4, FIX(0.860918669)); /* c7 */ + tmp12 = tmp15 + MULTIPLY(tmp10, FIX(0.261052384)); /* c5-c7 */ + tmp10 = tmp12 + tmp11 + MULTIPLY(z1, FIX(0.280143716)); /* c1-c5 */ + tmp13 = MULTIPLY(z3 + z4, - FIX(1.045510580)); /* -(c7+c11) */ + tmp12 += tmp13 + tmp14 - MULTIPLY(z3, FIX(1.478575242)); /* c1+c5-c7-c11 */ + tmp13 += tmp15 - tmp11 + MULTIPLY(z4, FIX(1.586706681)); /* c1+c11 */ + tmp15 += tmp14 - MULTIPLY(z1, FIX(0.676326758)) - /* c7-c11 */ + MULTIPLY(z4, FIX(1.982889723)); /* c5+c7 */ + + z1 -= z4; + z2 -= z3; + z3 = MULTIPLY(z1 + z2, FIX_0_541196100); /* c9 */ + tmp11 = z3 + MULTIPLY(z1, FIX_0_765366865); /* c3-c9 */ + tmp14 = z3 - MULTIPLY(z2, FIX_1_847759065); /* c3+c9 */ + + /* Final output stage */ + + wsptr[8*0] = (int) RIGHT_SHIFT(tmp20 + tmp10, CONST_BITS-PASS1_BITS); + wsptr[8*11] = (int) RIGHT_SHIFT(tmp20 - tmp10, CONST_BITS-PASS1_BITS); + wsptr[8*1] = (int) RIGHT_SHIFT(tmp21 + tmp11, CONST_BITS-PASS1_BITS); + wsptr[8*10] = (int) RIGHT_SHIFT(tmp21 - tmp11, CONST_BITS-PASS1_BITS); + wsptr[8*2] = (int) RIGHT_SHIFT(tmp22 + tmp12, CONST_BITS-PASS1_BITS); + wsptr[8*9] = (int) RIGHT_SHIFT(tmp22 - tmp12, CONST_BITS-PASS1_BITS); + wsptr[8*3] = (int) RIGHT_SHIFT(tmp23 + tmp13, CONST_BITS-PASS1_BITS); + wsptr[8*8] = (int) RIGHT_SHIFT(tmp23 - tmp13, CONST_BITS-PASS1_BITS); + wsptr[8*4] = (int) RIGHT_SHIFT(tmp24 + tmp14, CONST_BITS-PASS1_BITS); + wsptr[8*7] = (int) RIGHT_SHIFT(tmp24 - tmp14, CONST_BITS-PASS1_BITS); + wsptr[8*5] = (int) RIGHT_SHIFT(tmp25 + tmp15, CONST_BITS-PASS1_BITS); + wsptr[8*6] = (int) RIGHT_SHIFT(tmp25 - tmp15, CONST_BITS-PASS1_BITS); + } + + /* Pass 2: process 12 rows from work array, store into output array. */ + + wsptr = workspace; + for (ctr = 0; ctr < 12; ctr++) { + outptr = output_buf[ctr] + output_col; + + /* Even part */ + + /* Add fudge factor here for final descale. */ + z3 = (JLONG) wsptr[0] + (ONE << (PASS1_BITS+2)); + z3 = LEFT_SHIFT(z3, CONST_BITS); + + z4 = (JLONG) wsptr[4]; + z4 = MULTIPLY(z4, FIX(1.224744871)); /* c4 */ + + tmp10 = z3 + z4; + tmp11 = z3 - z4; + + z1 = (JLONG) wsptr[2]; + z4 = MULTIPLY(z1, FIX(1.366025404)); /* c2 */ + z1 = LEFT_SHIFT(z1, CONST_BITS); + z2 = (JLONG) wsptr[6]; + z2 = LEFT_SHIFT(z2, CONST_BITS); + + tmp12 = z1 - z2; + + tmp21 = z3 + tmp12; + tmp24 = z3 - tmp12; + + tmp12 = z4 + z2; + + tmp20 = tmp10 + tmp12; + tmp25 = tmp10 - tmp12; + + tmp12 = z4 - z1 - z2; + + tmp22 = tmp11 + tmp12; + tmp23 = tmp11 - tmp12; + + /* Odd part */ + + z1 = (JLONG) wsptr[1]; + z2 = (JLONG) wsptr[3]; + z3 = (JLONG) wsptr[5]; + z4 = (JLONG) wsptr[7]; + + tmp11 = MULTIPLY(z2, FIX(1.306562965)); /* c3 */ + tmp14 = MULTIPLY(z2, - FIX_0_541196100); /* -c9 */ + + tmp10 = z1 + z3; + tmp15 = MULTIPLY(tmp10 + z4, FIX(0.860918669)); /* c7 */ + tmp12 = tmp15 + MULTIPLY(tmp10, FIX(0.261052384)); /* c5-c7 */ + tmp10 = tmp12 + tmp11 + MULTIPLY(z1, FIX(0.280143716)); /* c1-c5 */ + tmp13 = MULTIPLY(z3 + z4, - FIX(1.045510580)); /* -(c7+c11) */ + tmp12 += tmp13 + tmp14 - MULTIPLY(z3, FIX(1.478575242)); /* c1+c5-c7-c11 */ + tmp13 += tmp15 - tmp11 + MULTIPLY(z4, FIX(1.586706681)); /* c1+c11 */ + tmp15 += tmp14 - MULTIPLY(z1, FIX(0.676326758)) - /* c7-c11 */ + MULTIPLY(z4, FIX(1.982889723)); /* c5+c7 */ + + z1 -= z4; + z2 -= z3; + z3 = MULTIPLY(z1 + z2, FIX_0_541196100); /* c9 */ + tmp11 = z3 + MULTIPLY(z1, FIX_0_765366865); /* c3-c9 */ + tmp14 = z3 - MULTIPLY(z2, FIX_1_847759065); /* c3+c9 */ + + /* Final output stage */ + + outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp20 + tmp10, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[11] = range_limit[(int) RIGHT_SHIFT(tmp20 - tmp10, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp21 + tmp11, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[10] = range_limit[(int) RIGHT_SHIFT(tmp21 - tmp11, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp22 + tmp12, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[9] = range_limit[(int) RIGHT_SHIFT(tmp22 - tmp12, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[3] = range_limit[(int) RIGHT_SHIFT(tmp23 + tmp13, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[8] = range_limit[(int) RIGHT_SHIFT(tmp23 - tmp13, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[4] = range_limit[(int) RIGHT_SHIFT(tmp24 + tmp14, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[7] = range_limit[(int) RIGHT_SHIFT(tmp24 - tmp14, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[5] = range_limit[(int) RIGHT_SHIFT(tmp25 + tmp15, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[6] = range_limit[(int) RIGHT_SHIFT(tmp25 - tmp15, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + + wsptr += 8; /* advance pointer to next row */ + } +} + + +/* + * Perform dequantization and inverse DCT on one block of coefficients, + * producing a 13x13 output block. + * + * Optimized algorithm with 29 multiplications in the 1-D kernel. + * cK represents sqrt(2) * cos(K*pi/26). + */ + +GLOBAL(void) +jpeg_idct_13x13 (j_decompress_ptr cinfo, jpeg_component_info *compptr, + JCOEFPTR coef_block, + JSAMPARRAY output_buf, JDIMENSION output_col) +{ + JLONG tmp10, tmp11, tmp12, tmp13, tmp14, tmp15; + JLONG tmp20, tmp21, tmp22, tmp23, tmp24, tmp25, tmp26; + JLONG z1, z2, z3, z4; + JCOEFPTR inptr; + ISLOW_MULT_TYPE *quantptr; + int *wsptr; + JSAMPROW outptr; + JSAMPLE *range_limit = IDCT_range_limit(cinfo); + int ctr; + int workspace[8*13]; /* buffers data between passes */ + SHIFT_TEMPS + + /* Pass 1: process columns from input, store into work array. */ + + inptr = coef_block; + quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table; + wsptr = workspace; + for (ctr = 0; ctr < 8; ctr++, inptr++, quantptr++, wsptr++) { + /* Even part */ + + z1 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]); + z1 = LEFT_SHIFT(z1, CONST_BITS); + /* Add fudge factor here for final descale. */ + z1 += ONE << (CONST_BITS-PASS1_BITS-1); + + z2 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]); + z3 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]); + z4 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]); + + tmp10 = z3 + z4; + tmp11 = z3 - z4; + + tmp12 = MULTIPLY(tmp10, FIX(1.155388986)); /* (c4+c6)/2 */ + tmp13 = MULTIPLY(tmp11, FIX(0.096834934)) + z1; /* (c4-c6)/2 */ + + tmp20 = MULTIPLY(z2, FIX(1.373119086)) + tmp12 + tmp13; /* c2 */ + tmp22 = MULTIPLY(z2, FIX(0.501487041)) - tmp12 + tmp13; /* c10 */ + + tmp12 = MULTIPLY(tmp10, FIX(0.316450131)); /* (c8-c12)/2 */ + tmp13 = MULTIPLY(tmp11, FIX(0.486914739)) + z1; /* (c8+c12)/2 */ + + tmp21 = MULTIPLY(z2, FIX(1.058554052)) - tmp12 + tmp13; /* c6 */ + tmp25 = MULTIPLY(z2, - FIX(1.252223920)) + tmp12 + tmp13; /* c4 */ + + tmp12 = MULTIPLY(tmp10, FIX(0.435816023)); /* (c2-c10)/2 */ + tmp13 = MULTIPLY(tmp11, FIX(0.937303064)) - z1; /* (c2+c10)/2 */ + + tmp23 = MULTIPLY(z2, - FIX(0.170464608)) - tmp12 - tmp13; /* c12 */ + tmp24 = MULTIPLY(z2, - FIX(0.803364869)) + tmp12 - tmp13; /* c8 */ + + tmp26 = MULTIPLY(tmp11 - z2, FIX(1.414213562)) + z1; /* c0 */ + + /* Odd part */ + + z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]); + z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]); + z3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]); + z4 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]); + + tmp11 = MULTIPLY(z1 + z2, FIX(1.322312651)); /* c3 */ + tmp12 = MULTIPLY(z1 + z3, FIX(1.163874945)); /* c5 */ + tmp15 = z1 + z4; + tmp13 = MULTIPLY(tmp15, FIX(0.937797057)); /* c7 */ + tmp10 = tmp11 + tmp12 + tmp13 - + MULTIPLY(z1, FIX(2.020082300)); /* c7+c5+c3-c1 */ + tmp14 = MULTIPLY(z2 + z3, - FIX(0.338443458)); /* -c11 */ + tmp11 += tmp14 + MULTIPLY(z2, FIX(0.837223564)); /* c5+c9+c11-c3 */ + tmp12 += tmp14 - MULTIPLY(z3, FIX(1.572116027)); /* c1+c5-c9-c11 */ + tmp14 = MULTIPLY(z2 + z4, - FIX(1.163874945)); /* -c5 */ + tmp11 += tmp14; + tmp13 += tmp14 + MULTIPLY(z4, FIX(2.205608352)); /* c3+c5+c9-c7 */ + tmp14 = MULTIPLY(z3 + z4, - FIX(0.657217813)); /* -c9 */ + tmp12 += tmp14; + tmp13 += tmp14; + tmp15 = MULTIPLY(tmp15, FIX(0.338443458)); /* c11 */ + tmp14 = tmp15 + MULTIPLY(z1, FIX(0.318774355)) - /* c9-c11 */ + MULTIPLY(z2, FIX(0.466105296)); /* c1-c7 */ + z1 = MULTIPLY(z3 - z2, FIX(0.937797057)); /* c7 */ + tmp14 += z1; + tmp15 += z1 + MULTIPLY(z3, FIX(0.384515595)) - /* c3-c7 */ + MULTIPLY(z4, FIX(1.742345811)); /* c1+c11 */ + + /* Final output stage */ + + wsptr[8*0] = (int) RIGHT_SHIFT(tmp20 + tmp10, CONST_BITS-PASS1_BITS); + wsptr[8*12] = (int) RIGHT_SHIFT(tmp20 - tmp10, CONST_BITS-PASS1_BITS); + wsptr[8*1] = (int) RIGHT_SHIFT(tmp21 + tmp11, CONST_BITS-PASS1_BITS); + wsptr[8*11] = (int) RIGHT_SHIFT(tmp21 - tmp11, CONST_BITS-PASS1_BITS); + wsptr[8*2] = (int) RIGHT_SHIFT(tmp22 + tmp12, CONST_BITS-PASS1_BITS); + wsptr[8*10] = (int) RIGHT_SHIFT(tmp22 - tmp12, CONST_BITS-PASS1_BITS); + wsptr[8*3] = (int) RIGHT_SHIFT(tmp23 + tmp13, CONST_BITS-PASS1_BITS); + wsptr[8*9] = (int) RIGHT_SHIFT(tmp23 - tmp13, CONST_BITS-PASS1_BITS); + wsptr[8*4] = (int) RIGHT_SHIFT(tmp24 + tmp14, CONST_BITS-PASS1_BITS); + wsptr[8*8] = (int) RIGHT_SHIFT(tmp24 - tmp14, CONST_BITS-PASS1_BITS); + wsptr[8*5] = (int) RIGHT_SHIFT(tmp25 + tmp15, CONST_BITS-PASS1_BITS); + wsptr[8*7] = (int) RIGHT_SHIFT(tmp25 - tmp15, CONST_BITS-PASS1_BITS); + wsptr[8*6] = (int) RIGHT_SHIFT(tmp26, CONST_BITS-PASS1_BITS); + } + + /* Pass 2: process 13 rows from work array, store into output array. */ + + wsptr = workspace; + for (ctr = 0; ctr < 13; ctr++) { + outptr = output_buf[ctr] + output_col; + + /* Even part */ + + /* Add fudge factor here for final descale. */ + z1 = (JLONG) wsptr[0] + (ONE << (PASS1_BITS+2)); + z1 = LEFT_SHIFT(z1, CONST_BITS); + + z2 = (JLONG) wsptr[2]; + z3 = (JLONG) wsptr[4]; + z4 = (JLONG) wsptr[6]; + + tmp10 = z3 + z4; + tmp11 = z3 - z4; + + tmp12 = MULTIPLY(tmp10, FIX(1.155388986)); /* (c4+c6)/2 */ + tmp13 = MULTIPLY(tmp11, FIX(0.096834934)) + z1; /* (c4-c6)/2 */ + + tmp20 = MULTIPLY(z2, FIX(1.373119086)) + tmp12 + tmp13; /* c2 */ + tmp22 = MULTIPLY(z2, FIX(0.501487041)) - tmp12 + tmp13; /* c10 */ + + tmp12 = MULTIPLY(tmp10, FIX(0.316450131)); /* (c8-c12)/2 */ + tmp13 = MULTIPLY(tmp11, FIX(0.486914739)) + z1; /* (c8+c12)/2 */ + + tmp21 = MULTIPLY(z2, FIX(1.058554052)) - tmp12 + tmp13; /* c6 */ + tmp25 = MULTIPLY(z2, - FIX(1.252223920)) + tmp12 + tmp13; /* c4 */ + + tmp12 = MULTIPLY(tmp10, FIX(0.435816023)); /* (c2-c10)/2 */ + tmp13 = MULTIPLY(tmp11, FIX(0.937303064)) - z1; /* (c2+c10)/2 */ + + tmp23 = MULTIPLY(z2, - FIX(0.170464608)) - tmp12 - tmp13; /* c12 */ + tmp24 = MULTIPLY(z2, - FIX(0.803364869)) + tmp12 - tmp13; /* c8 */ + + tmp26 = MULTIPLY(tmp11 - z2, FIX(1.414213562)) + z1; /* c0 */ + + /* Odd part */ + + z1 = (JLONG) wsptr[1]; + z2 = (JLONG) wsptr[3]; + z3 = (JLONG) wsptr[5]; + z4 = (JLONG) wsptr[7]; + + tmp11 = MULTIPLY(z1 + z2, FIX(1.322312651)); /* c3 */ + tmp12 = MULTIPLY(z1 + z3, FIX(1.163874945)); /* c5 */ + tmp15 = z1 + z4; + tmp13 = MULTIPLY(tmp15, FIX(0.937797057)); /* c7 */ + tmp10 = tmp11 + tmp12 + tmp13 - + MULTIPLY(z1, FIX(2.020082300)); /* c7+c5+c3-c1 */ + tmp14 = MULTIPLY(z2 + z3, - FIX(0.338443458)); /* -c11 */ + tmp11 += tmp14 + MULTIPLY(z2, FIX(0.837223564)); /* c5+c9+c11-c3 */ + tmp12 += tmp14 - MULTIPLY(z3, FIX(1.572116027)); /* c1+c5-c9-c11 */ + tmp14 = MULTIPLY(z2 + z4, - FIX(1.163874945)); /* -c5 */ + tmp11 += tmp14; + tmp13 += tmp14 + MULTIPLY(z4, FIX(2.205608352)); /* c3+c5+c9-c7 */ + tmp14 = MULTIPLY(z3 + z4, - FIX(0.657217813)); /* -c9 */ + tmp12 += tmp14; + tmp13 += tmp14; + tmp15 = MULTIPLY(tmp15, FIX(0.338443458)); /* c11 */ + tmp14 = tmp15 + MULTIPLY(z1, FIX(0.318774355)) - /* c9-c11 */ + MULTIPLY(z2, FIX(0.466105296)); /* c1-c7 */ + z1 = MULTIPLY(z3 - z2, FIX(0.937797057)); /* c7 */ + tmp14 += z1; + tmp15 += z1 + MULTIPLY(z3, FIX(0.384515595)) - /* c3-c7 */ + MULTIPLY(z4, FIX(1.742345811)); /* c1+c11 */ + + /* Final output stage */ + + outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp20 + tmp10, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[12] = range_limit[(int) RIGHT_SHIFT(tmp20 - tmp10, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp21 + tmp11, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[11] = range_limit[(int) RIGHT_SHIFT(tmp21 - tmp11, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp22 + tmp12, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[10] = range_limit[(int) RIGHT_SHIFT(tmp22 - tmp12, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[3] = range_limit[(int) RIGHT_SHIFT(tmp23 + tmp13, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[9] = range_limit[(int) RIGHT_SHIFT(tmp23 - tmp13, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[4] = range_limit[(int) RIGHT_SHIFT(tmp24 + tmp14, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[8] = range_limit[(int) RIGHT_SHIFT(tmp24 - tmp14, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[5] = range_limit[(int) RIGHT_SHIFT(tmp25 + tmp15, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[7] = range_limit[(int) RIGHT_SHIFT(tmp25 - tmp15, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[6] = range_limit[(int) RIGHT_SHIFT(tmp26, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + + wsptr += 8; /* advance pointer to next row */ + } +} + + +/* + * Perform dequantization and inverse DCT on one block of coefficients, + * producing a 14x14 output block. + * + * Optimized algorithm with 20 multiplications in the 1-D kernel. + * cK represents sqrt(2) * cos(K*pi/28). + */ + +GLOBAL(void) +jpeg_idct_14x14 (j_decompress_ptr cinfo, jpeg_component_info *compptr, + JCOEFPTR coef_block, + JSAMPARRAY output_buf, JDIMENSION output_col) +{ + JLONG tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, tmp16; + JLONG tmp20, tmp21, tmp22, tmp23, tmp24, tmp25, tmp26; + JLONG z1, z2, z3, z4; + JCOEFPTR inptr; + ISLOW_MULT_TYPE *quantptr; + int *wsptr; + JSAMPROW outptr; + JSAMPLE *range_limit = IDCT_range_limit(cinfo); + int ctr; + int workspace[8*14]; /* buffers data between passes */ + SHIFT_TEMPS + + /* Pass 1: process columns from input, store into work array. */ + + inptr = coef_block; + quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table; + wsptr = workspace; + for (ctr = 0; ctr < 8; ctr++, inptr++, quantptr++, wsptr++) { + /* Even part */ + + z1 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]); + z1 = LEFT_SHIFT(z1, CONST_BITS); + /* Add fudge factor here for final descale. */ + z1 += ONE << (CONST_BITS-PASS1_BITS-1); + z4 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]); + z2 = MULTIPLY(z4, FIX(1.274162392)); /* c4 */ + z3 = MULTIPLY(z4, FIX(0.314692123)); /* c12 */ + z4 = MULTIPLY(z4, FIX(0.881747734)); /* c8 */ + + tmp10 = z1 + z2; + tmp11 = z1 + z3; + tmp12 = z1 - z4; + + tmp23 = RIGHT_SHIFT(z1 - LEFT_SHIFT(z2 + z3 - z4, 1), + CONST_BITS-PASS1_BITS); /* c0 = (c4+c12-c8)*2 */ + + z1 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]); + z2 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]); + + z3 = MULTIPLY(z1 + z2, FIX(1.105676686)); /* c6 */ + + tmp13 = z3 + MULTIPLY(z1, FIX(0.273079590)); /* c2-c6 */ + tmp14 = z3 - MULTIPLY(z2, FIX(1.719280954)); /* c6+c10 */ + tmp15 = MULTIPLY(z1, FIX(0.613604268)) - /* c10 */ + MULTIPLY(z2, FIX(1.378756276)); /* c2 */ + + tmp20 = tmp10 + tmp13; + tmp26 = tmp10 - tmp13; + tmp21 = tmp11 + tmp14; + tmp25 = tmp11 - tmp14; + tmp22 = tmp12 + tmp15; + tmp24 = tmp12 - tmp15; + + /* Odd part */ + + z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]); + z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]); + z3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]); + z4 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]); + tmp13 = LEFT_SHIFT(z4, CONST_BITS); + + tmp14 = z1 + z3; + tmp11 = MULTIPLY(z1 + z2, FIX(1.334852607)); /* c3 */ + tmp12 = MULTIPLY(tmp14, FIX(1.197448846)); /* c5 */ + tmp10 = tmp11 + tmp12 + tmp13 - MULTIPLY(z1, FIX(1.126980169)); /* c3+c5-c1 */ + tmp14 = MULTIPLY(tmp14, FIX(0.752406978)); /* c9 */ + tmp16 = tmp14 - MULTIPLY(z1, FIX(1.061150426)); /* c9+c11-c13 */ + z1 -= z2; + tmp15 = MULTIPLY(z1, FIX(0.467085129)) - tmp13; /* c11 */ + tmp16 += tmp15; + z1 += z4; + z4 = MULTIPLY(z2 + z3, - FIX(0.158341681)) - tmp13; /* -c13 */ + tmp11 += z4 - MULTIPLY(z2, FIX(0.424103948)); /* c3-c9-c13 */ + tmp12 += z4 - MULTIPLY(z3, FIX(2.373959773)); /* c3+c5-c13 */ + z4 = MULTIPLY(z3 - z2, FIX(1.405321284)); /* c1 */ + tmp14 += z4 + tmp13 - MULTIPLY(z3, FIX(1.6906431334)); /* c1+c9-c11 */ + tmp15 += z4 + MULTIPLY(z2, FIX(0.674957567)); /* c1+c11-c5 */ + + tmp13 = LEFT_SHIFT(z1 - z3, PASS1_BITS); + + /* Final output stage */ + + wsptr[8*0] = (int) RIGHT_SHIFT(tmp20 + tmp10, CONST_BITS-PASS1_BITS); + wsptr[8*13] = (int) RIGHT_SHIFT(tmp20 - tmp10, CONST_BITS-PASS1_BITS); + wsptr[8*1] = (int) RIGHT_SHIFT(tmp21 + tmp11, CONST_BITS-PASS1_BITS); + wsptr[8*12] = (int) RIGHT_SHIFT(tmp21 - tmp11, CONST_BITS-PASS1_BITS); + wsptr[8*2] = (int) RIGHT_SHIFT(tmp22 + tmp12, CONST_BITS-PASS1_BITS); + wsptr[8*11] = (int) RIGHT_SHIFT(tmp22 - tmp12, CONST_BITS-PASS1_BITS); + wsptr[8*3] = (int) (tmp23 + tmp13); + wsptr[8*10] = (int) (tmp23 - tmp13); + wsptr[8*4] = (int) RIGHT_SHIFT(tmp24 + tmp14, CONST_BITS-PASS1_BITS); + wsptr[8*9] = (int) RIGHT_SHIFT(tmp24 - tmp14, CONST_BITS-PASS1_BITS); + wsptr[8*5] = (int) RIGHT_SHIFT(tmp25 + tmp15, CONST_BITS-PASS1_BITS); + wsptr[8*8] = (int) RIGHT_SHIFT(tmp25 - tmp15, CONST_BITS-PASS1_BITS); + wsptr[8*6] = (int) RIGHT_SHIFT(tmp26 + tmp16, CONST_BITS-PASS1_BITS); + wsptr[8*7] = (int) RIGHT_SHIFT(tmp26 - tmp16, CONST_BITS-PASS1_BITS); + } + + /* Pass 2: process 14 rows from work array, store into output array. */ + + wsptr = workspace; + for (ctr = 0; ctr < 14; ctr++) { + outptr = output_buf[ctr] + output_col; + + /* Even part */ + + /* Add fudge factor here for final descale. */ + z1 = (JLONG) wsptr[0] + (ONE << (PASS1_BITS+2)); + z1 = LEFT_SHIFT(z1, CONST_BITS); + z4 = (JLONG) wsptr[4]; + z2 = MULTIPLY(z4, FIX(1.274162392)); /* c4 */ + z3 = MULTIPLY(z4, FIX(0.314692123)); /* c12 */ + z4 = MULTIPLY(z4, FIX(0.881747734)); /* c8 */ + + tmp10 = z1 + z2; + tmp11 = z1 + z3; + tmp12 = z1 - z4; + + tmp23 = z1 - LEFT_SHIFT(z2 + z3 - z4, 1); /* c0 = (c4+c12-c8)*2 */ + + z1 = (JLONG) wsptr[2]; + z2 = (JLONG) wsptr[6]; + + z3 = MULTIPLY(z1 + z2, FIX(1.105676686)); /* c6 */ + + tmp13 = z3 + MULTIPLY(z1, FIX(0.273079590)); /* c2-c6 */ + tmp14 = z3 - MULTIPLY(z2, FIX(1.719280954)); /* c6+c10 */ + tmp15 = MULTIPLY(z1, FIX(0.613604268)) - /* c10 */ + MULTIPLY(z2, FIX(1.378756276)); /* c2 */ + + tmp20 = tmp10 + tmp13; + tmp26 = tmp10 - tmp13; + tmp21 = tmp11 + tmp14; + tmp25 = tmp11 - tmp14; + tmp22 = tmp12 + tmp15; + tmp24 = tmp12 - tmp15; + + /* Odd part */ + + z1 = (JLONG) wsptr[1]; + z2 = (JLONG) wsptr[3]; + z3 = (JLONG) wsptr[5]; + z4 = (JLONG) wsptr[7]; + z4 = LEFT_SHIFT(z4, CONST_BITS); + + tmp14 = z1 + z3; + tmp11 = MULTIPLY(z1 + z2, FIX(1.334852607)); /* c3 */ + tmp12 = MULTIPLY(tmp14, FIX(1.197448846)); /* c5 */ + tmp10 = tmp11 + tmp12 + z4 - MULTIPLY(z1, FIX(1.126980169)); /* c3+c5-c1 */ + tmp14 = MULTIPLY(tmp14, FIX(0.752406978)); /* c9 */ + tmp16 = tmp14 - MULTIPLY(z1, FIX(1.061150426)); /* c9+c11-c13 */ + z1 -= z2; + tmp15 = MULTIPLY(z1, FIX(0.467085129)) - z4; /* c11 */ + tmp16 += tmp15; + tmp13 = MULTIPLY(z2 + z3, - FIX(0.158341681)) - z4; /* -c13 */ + tmp11 += tmp13 - MULTIPLY(z2, FIX(0.424103948)); /* c3-c9-c13 */ + tmp12 += tmp13 - MULTIPLY(z3, FIX(2.373959773)); /* c3+c5-c13 */ + tmp13 = MULTIPLY(z3 - z2, FIX(1.405321284)); /* c1 */ + tmp14 += tmp13 + z4 - MULTIPLY(z3, FIX(1.6906431334)); /* c1+c9-c11 */ + tmp15 += tmp13 + MULTIPLY(z2, FIX(0.674957567)); /* c1+c11-c5 */ + + tmp13 = LEFT_SHIFT(z1 - z3, CONST_BITS) + z4; + + /* Final output stage */ + + outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp20 + tmp10, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[13] = range_limit[(int) RIGHT_SHIFT(tmp20 - tmp10, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp21 + tmp11, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[12] = range_limit[(int) RIGHT_SHIFT(tmp21 - tmp11, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp22 + tmp12, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[11] = range_limit[(int) RIGHT_SHIFT(tmp22 - tmp12, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[3] = range_limit[(int) RIGHT_SHIFT(tmp23 + tmp13, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[10] = range_limit[(int) RIGHT_SHIFT(tmp23 - tmp13, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[4] = range_limit[(int) RIGHT_SHIFT(tmp24 + tmp14, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[9] = range_limit[(int) RIGHT_SHIFT(tmp24 - tmp14, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[5] = range_limit[(int) RIGHT_SHIFT(tmp25 + tmp15, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[8] = range_limit[(int) RIGHT_SHIFT(tmp25 - tmp15, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[6] = range_limit[(int) RIGHT_SHIFT(tmp26 + tmp16, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[7] = range_limit[(int) RIGHT_SHIFT(tmp26 - tmp16, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + + wsptr += 8; /* advance pointer to next row */ + } +} + + +/* + * Perform dequantization and inverse DCT on one block of coefficients, + * producing a 15x15 output block. + * + * Optimized algorithm with 22 multiplications in the 1-D kernel. + * cK represents sqrt(2) * cos(K*pi/30). + */ + +GLOBAL(void) +jpeg_idct_15x15 (j_decompress_ptr cinfo, jpeg_component_info *compptr, + JCOEFPTR coef_block, + JSAMPARRAY output_buf, JDIMENSION output_col) +{ + JLONG tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, tmp16; + JLONG tmp20, tmp21, tmp22, tmp23, tmp24, tmp25, tmp26, tmp27; + JLONG z1, z2, z3, z4; + JCOEFPTR inptr; + ISLOW_MULT_TYPE *quantptr; + int *wsptr; + JSAMPROW outptr; + JSAMPLE *range_limit = IDCT_range_limit(cinfo); + int ctr; + int workspace[8*15]; /* buffers data between passes */ + SHIFT_TEMPS + + /* Pass 1: process columns from input, store into work array. */ + + inptr = coef_block; + quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table; + wsptr = workspace; + for (ctr = 0; ctr < 8; ctr++, inptr++, quantptr++, wsptr++) { + /* Even part */ + + z1 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]); + z1 = LEFT_SHIFT(z1, CONST_BITS); + /* Add fudge factor here for final descale. */ + z1 += ONE << (CONST_BITS-PASS1_BITS-1); + + z2 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]); + z3 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]); + z4 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]); + + tmp10 = MULTIPLY(z4, FIX(0.437016024)); /* c12 */ + tmp11 = MULTIPLY(z4, FIX(1.144122806)); /* c6 */ + + tmp12 = z1 - tmp10; + tmp13 = z1 + tmp11; + z1 -= LEFT_SHIFT(tmp11 - tmp10, 1); /* c0 = (c6-c12)*2 */ + + z4 = z2 - z3; + z3 += z2; + tmp10 = MULTIPLY(z3, FIX(1.337628990)); /* (c2+c4)/2 */ + tmp11 = MULTIPLY(z4, FIX(0.045680613)); /* (c2-c4)/2 */ + z2 = MULTIPLY(z2, FIX(1.439773946)); /* c4+c14 */ + + tmp20 = tmp13 + tmp10 + tmp11; + tmp23 = tmp12 - tmp10 + tmp11 + z2; + + tmp10 = MULTIPLY(z3, FIX(0.547059574)); /* (c8+c14)/2 */ + tmp11 = MULTIPLY(z4, FIX(0.399234004)); /* (c8-c14)/2 */ + + tmp25 = tmp13 - tmp10 - tmp11; + tmp26 = tmp12 + tmp10 - tmp11 - z2; + + tmp10 = MULTIPLY(z3, FIX(0.790569415)); /* (c6+c12)/2 */ + tmp11 = MULTIPLY(z4, FIX(0.353553391)); /* (c6-c12)/2 */ + + tmp21 = tmp12 + tmp10 + tmp11; + tmp24 = tmp13 - tmp10 + tmp11; + tmp11 += tmp11; + tmp22 = z1 + tmp11; /* c10 = c6-c12 */ + tmp27 = z1 - tmp11 - tmp11; /* c0 = (c6-c12)*2 */ + + /* Odd part */ + + z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]); + z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]); + z4 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]); + z3 = MULTIPLY(z4, FIX(1.224744871)); /* c5 */ + z4 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]); + + tmp13 = z2 - z4; + tmp15 = MULTIPLY(z1 + tmp13, FIX(0.831253876)); /* c9 */ + tmp11 = tmp15 + MULTIPLY(z1, FIX(0.513743148)); /* c3-c9 */ + tmp14 = tmp15 - MULTIPLY(tmp13, FIX(2.176250899)); /* c3+c9 */ + + tmp13 = MULTIPLY(z2, - FIX(0.831253876)); /* -c9 */ + tmp15 = MULTIPLY(z2, - FIX(1.344997024)); /* -c3 */ + z2 = z1 - z4; + tmp12 = z3 + MULTIPLY(z2, FIX(1.406466353)); /* c1 */ + + tmp10 = tmp12 + MULTIPLY(z4, FIX(2.457431844)) - tmp15; /* c1+c7 */ + tmp16 = tmp12 - MULTIPLY(z1, FIX(1.112434820)) + tmp13; /* c1-c13 */ + tmp12 = MULTIPLY(z2, FIX(1.224744871)) - z3; /* c5 */ + z2 = MULTIPLY(z1 + z4, FIX(0.575212477)); /* c11 */ + tmp13 += z2 + MULTIPLY(z1, FIX(0.475753014)) - z3; /* c7-c11 */ + tmp15 += z2 - MULTIPLY(z4, FIX(0.869244010)) + z3; /* c11+c13 */ + + /* Final output stage */ + + wsptr[8*0] = (int) RIGHT_SHIFT(tmp20 + tmp10, CONST_BITS-PASS1_BITS); + wsptr[8*14] = (int) RIGHT_SHIFT(tmp20 - tmp10, CONST_BITS-PASS1_BITS); + wsptr[8*1] = (int) RIGHT_SHIFT(tmp21 + tmp11, CONST_BITS-PASS1_BITS); + wsptr[8*13] = (int) RIGHT_SHIFT(tmp21 - tmp11, CONST_BITS-PASS1_BITS); + wsptr[8*2] = (int) RIGHT_SHIFT(tmp22 + tmp12, CONST_BITS-PASS1_BITS); + wsptr[8*12] = (int) RIGHT_SHIFT(tmp22 - tmp12, CONST_BITS-PASS1_BITS); + wsptr[8*3] = (int) RIGHT_SHIFT(tmp23 + tmp13, CONST_BITS-PASS1_BITS); + wsptr[8*11] = (int) RIGHT_SHIFT(tmp23 - tmp13, CONST_BITS-PASS1_BITS); + wsptr[8*4] = (int) RIGHT_SHIFT(tmp24 + tmp14, CONST_BITS-PASS1_BITS); + wsptr[8*10] = (int) RIGHT_SHIFT(tmp24 - tmp14, CONST_BITS-PASS1_BITS); + wsptr[8*5] = (int) RIGHT_SHIFT(tmp25 + tmp15, CONST_BITS-PASS1_BITS); + wsptr[8*9] = (int) RIGHT_SHIFT(tmp25 - tmp15, CONST_BITS-PASS1_BITS); + wsptr[8*6] = (int) RIGHT_SHIFT(tmp26 + tmp16, CONST_BITS-PASS1_BITS); + wsptr[8*8] = (int) RIGHT_SHIFT(tmp26 - tmp16, CONST_BITS-PASS1_BITS); + wsptr[8*7] = (int) RIGHT_SHIFT(tmp27, CONST_BITS-PASS1_BITS); + } + + /* Pass 2: process 15 rows from work array, store into output array. */ + + wsptr = workspace; + for (ctr = 0; ctr < 15; ctr++) { + outptr = output_buf[ctr] + output_col; + + /* Even part */ + + /* Add fudge factor here for final descale. */ + z1 = (JLONG) wsptr[0] + (ONE << (PASS1_BITS+2)); + z1 = LEFT_SHIFT(z1, CONST_BITS); + + z2 = (JLONG) wsptr[2]; + z3 = (JLONG) wsptr[4]; + z4 = (JLONG) wsptr[6]; + + tmp10 = MULTIPLY(z4, FIX(0.437016024)); /* c12 */ + tmp11 = MULTIPLY(z4, FIX(1.144122806)); /* c6 */ + + tmp12 = z1 - tmp10; + tmp13 = z1 + tmp11; + z1 -= LEFT_SHIFT(tmp11 - tmp10, 1); /* c0 = (c6-c12)*2 */ + + z4 = z2 - z3; + z3 += z2; + tmp10 = MULTIPLY(z3, FIX(1.337628990)); /* (c2+c4)/2 */ + tmp11 = MULTIPLY(z4, FIX(0.045680613)); /* (c2-c4)/2 */ + z2 = MULTIPLY(z2, FIX(1.439773946)); /* c4+c14 */ + + tmp20 = tmp13 + tmp10 + tmp11; + tmp23 = tmp12 - tmp10 + tmp11 + z2; + + tmp10 = MULTIPLY(z3, FIX(0.547059574)); /* (c8+c14)/2 */ + tmp11 = MULTIPLY(z4, FIX(0.399234004)); /* (c8-c14)/2 */ + + tmp25 = tmp13 - tmp10 - tmp11; + tmp26 = tmp12 + tmp10 - tmp11 - z2; + + tmp10 = MULTIPLY(z3, FIX(0.790569415)); /* (c6+c12)/2 */ + tmp11 = MULTIPLY(z4, FIX(0.353553391)); /* (c6-c12)/2 */ + + tmp21 = tmp12 + tmp10 + tmp11; + tmp24 = tmp13 - tmp10 + tmp11; + tmp11 += tmp11; + tmp22 = z1 + tmp11; /* c10 = c6-c12 */ + tmp27 = z1 - tmp11 - tmp11; /* c0 = (c6-c12)*2 */ + + /* Odd part */ + + z1 = (JLONG) wsptr[1]; + z2 = (JLONG) wsptr[3]; + z4 = (JLONG) wsptr[5]; + z3 = MULTIPLY(z4, FIX(1.224744871)); /* c5 */ + z4 = (JLONG) wsptr[7]; + + tmp13 = z2 - z4; + tmp15 = MULTIPLY(z1 + tmp13, FIX(0.831253876)); /* c9 */ + tmp11 = tmp15 + MULTIPLY(z1, FIX(0.513743148)); /* c3-c9 */ + tmp14 = tmp15 - MULTIPLY(tmp13, FIX(2.176250899)); /* c3+c9 */ + + tmp13 = MULTIPLY(z2, - FIX(0.831253876)); /* -c9 */ + tmp15 = MULTIPLY(z2, - FIX(1.344997024)); /* -c3 */ + z2 = z1 - z4; + tmp12 = z3 + MULTIPLY(z2, FIX(1.406466353)); /* c1 */ + + tmp10 = tmp12 + MULTIPLY(z4, FIX(2.457431844)) - tmp15; /* c1+c7 */ + tmp16 = tmp12 - MULTIPLY(z1, FIX(1.112434820)) + tmp13; /* c1-c13 */ + tmp12 = MULTIPLY(z2, FIX(1.224744871)) - z3; /* c5 */ + z2 = MULTIPLY(z1 + z4, FIX(0.575212477)); /* c11 */ + tmp13 += z2 + MULTIPLY(z1, FIX(0.475753014)) - z3; /* c7-c11 */ + tmp15 += z2 - MULTIPLY(z4, FIX(0.869244010)) + z3; /* c11+c13 */ + + /* Final output stage */ + + outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp20 + tmp10, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[14] = range_limit[(int) RIGHT_SHIFT(tmp20 - tmp10, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp21 + tmp11, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[13] = range_limit[(int) RIGHT_SHIFT(tmp21 - tmp11, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp22 + tmp12, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[12] = range_limit[(int) RIGHT_SHIFT(tmp22 - tmp12, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[3] = range_limit[(int) RIGHT_SHIFT(tmp23 + tmp13, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[11] = range_limit[(int) RIGHT_SHIFT(tmp23 - tmp13, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[4] = range_limit[(int) RIGHT_SHIFT(tmp24 + tmp14, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[10] = range_limit[(int) RIGHT_SHIFT(tmp24 - tmp14, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[5] = range_limit[(int) RIGHT_SHIFT(tmp25 + tmp15, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[9] = range_limit[(int) RIGHT_SHIFT(tmp25 - tmp15, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[6] = range_limit[(int) RIGHT_SHIFT(tmp26 + tmp16, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[8] = range_limit[(int) RIGHT_SHIFT(tmp26 - tmp16, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[7] = range_limit[(int) RIGHT_SHIFT(tmp27, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + + wsptr += 8; /* advance pointer to next row */ + } +} + + +/* + * Perform dequantization and inverse DCT on one block of coefficients, + * producing a 16x16 output block. + * + * Optimized algorithm with 28 multiplications in the 1-D kernel. + * cK represents sqrt(2) * cos(K*pi/32). + */ + +GLOBAL(void) +jpeg_idct_16x16 (j_decompress_ptr cinfo, jpeg_component_info *compptr, + JCOEFPTR coef_block, + JSAMPARRAY output_buf, JDIMENSION output_col) +{ + JLONG tmp0, tmp1, tmp2, tmp3, tmp10, tmp11, tmp12, tmp13; + JLONG tmp20, tmp21, tmp22, tmp23, tmp24, tmp25, tmp26, tmp27; + JLONG z1, z2, z3, z4; + JCOEFPTR inptr; + ISLOW_MULT_TYPE *quantptr; + int *wsptr; + JSAMPROW outptr; + JSAMPLE *range_limit = IDCT_range_limit(cinfo); + int ctr; + int workspace[8*16]; /* buffers data between passes */ + SHIFT_TEMPS + + /* Pass 1: process columns from input, store into work array. */ + + inptr = coef_block; + quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table; + wsptr = workspace; + for (ctr = 0; ctr < 8; ctr++, inptr++, quantptr++, wsptr++) { + /* Even part */ + + tmp0 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]); + tmp0 = LEFT_SHIFT(tmp0, CONST_BITS); + /* Add fudge factor here for final descale. */ + tmp0 += 1 << (CONST_BITS-PASS1_BITS-1); + + z1 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]); + tmp1 = MULTIPLY(z1, FIX(1.306562965)); /* c4[16] = c2[8] */ + tmp2 = MULTIPLY(z1, FIX_0_541196100); /* c12[16] = c6[8] */ + + tmp10 = tmp0 + tmp1; + tmp11 = tmp0 - tmp1; + tmp12 = tmp0 + tmp2; + tmp13 = tmp0 - tmp2; + + z1 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]); + z2 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]); + z3 = z1 - z2; + z4 = MULTIPLY(z3, FIX(0.275899379)); /* c14[16] = c7[8] */ + z3 = MULTIPLY(z3, FIX(1.387039845)); /* c2[16] = c1[8] */ + + tmp0 = z3 + MULTIPLY(z2, FIX_2_562915447); /* (c6+c2)[16] = (c3+c1)[8] */ + tmp1 = z4 + MULTIPLY(z1, FIX_0_899976223); /* (c6-c14)[16] = (c3-c7)[8] */ + tmp2 = z3 - MULTIPLY(z1, FIX(0.601344887)); /* (c2-c10)[16] = (c1-c5)[8] */ + tmp3 = z4 - MULTIPLY(z2, FIX(0.509795579)); /* (c10-c14)[16] = (c5-c7)[8] */ + + tmp20 = tmp10 + tmp0; + tmp27 = tmp10 - tmp0; + tmp21 = tmp12 + tmp1; + tmp26 = tmp12 - tmp1; + tmp22 = tmp13 + tmp2; + tmp25 = tmp13 - tmp2; + tmp23 = tmp11 + tmp3; + tmp24 = tmp11 - tmp3; + + /* Odd part */ + + z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]); + z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]); + z3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]); + z4 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]); + + tmp11 = z1 + z3; + + tmp1 = MULTIPLY(z1 + z2, FIX(1.353318001)); /* c3 */ + tmp2 = MULTIPLY(tmp11, FIX(1.247225013)); /* c5 */ + tmp3 = MULTIPLY(z1 + z4, FIX(1.093201867)); /* c7 */ + tmp10 = MULTIPLY(z1 - z4, FIX(0.897167586)); /* c9 */ + tmp11 = MULTIPLY(tmp11, FIX(0.666655658)); /* c11 */ + tmp12 = MULTIPLY(z1 - z2, FIX(0.410524528)); /* c13 */ + tmp0 = tmp1 + tmp2 + tmp3 - + MULTIPLY(z1, FIX(2.286341144)); /* c7+c5+c3-c1 */ + tmp13 = tmp10 + tmp11 + tmp12 - + MULTIPLY(z1, FIX(1.835730603)); /* c9+c11+c13-c15 */ + z1 = MULTIPLY(z2 + z3, FIX(0.138617169)); /* c15 */ + tmp1 += z1 + MULTIPLY(z2, FIX(0.071888074)); /* c9+c11-c3-c15 */ + tmp2 += z1 - MULTIPLY(z3, FIX(1.125726048)); /* c5+c7+c15-c3 */ + z1 = MULTIPLY(z3 - z2, FIX(1.407403738)); /* c1 */ + tmp11 += z1 - MULTIPLY(z3, FIX(0.766367282)); /* c1+c11-c9-c13 */ + tmp12 += z1 + MULTIPLY(z2, FIX(1.971951411)); /* c1+c5+c13-c7 */ + z2 += z4; + z1 = MULTIPLY(z2, - FIX(0.666655658)); /* -c11 */ + tmp1 += z1; + tmp3 += z1 + MULTIPLY(z4, FIX(1.065388962)); /* c3+c11+c15-c7 */ + z2 = MULTIPLY(z2, - FIX(1.247225013)); /* -c5 */ + tmp10 += z2 + MULTIPLY(z4, FIX(3.141271809)); /* c1+c5+c9-c13 */ + tmp12 += z2; + z2 = MULTIPLY(z3 + z4, - FIX(1.353318001)); /* -c3 */ + tmp2 += z2; + tmp3 += z2; + z2 = MULTIPLY(z4 - z3, FIX(0.410524528)); /* c13 */ + tmp10 += z2; + tmp11 += z2; + + /* Final output stage */ + + wsptr[8*0] = (int) RIGHT_SHIFT(tmp20 + tmp0, CONST_BITS-PASS1_BITS); + wsptr[8*15] = (int) RIGHT_SHIFT(tmp20 - tmp0, CONST_BITS-PASS1_BITS); + wsptr[8*1] = (int) RIGHT_SHIFT(tmp21 + tmp1, CONST_BITS-PASS1_BITS); + wsptr[8*14] = (int) RIGHT_SHIFT(tmp21 - tmp1, CONST_BITS-PASS1_BITS); + wsptr[8*2] = (int) RIGHT_SHIFT(tmp22 + tmp2, CONST_BITS-PASS1_BITS); + wsptr[8*13] = (int) RIGHT_SHIFT(tmp22 - tmp2, CONST_BITS-PASS1_BITS); + wsptr[8*3] = (int) RIGHT_SHIFT(tmp23 + tmp3, CONST_BITS-PASS1_BITS); + wsptr[8*12] = (int) RIGHT_SHIFT(tmp23 - tmp3, CONST_BITS-PASS1_BITS); + wsptr[8*4] = (int) RIGHT_SHIFT(tmp24 + tmp10, CONST_BITS-PASS1_BITS); + wsptr[8*11] = (int) RIGHT_SHIFT(tmp24 - tmp10, CONST_BITS-PASS1_BITS); + wsptr[8*5] = (int) RIGHT_SHIFT(tmp25 + tmp11, CONST_BITS-PASS1_BITS); + wsptr[8*10] = (int) RIGHT_SHIFT(tmp25 - tmp11, CONST_BITS-PASS1_BITS); + wsptr[8*6] = (int) RIGHT_SHIFT(tmp26 + tmp12, CONST_BITS-PASS1_BITS); + wsptr[8*9] = (int) RIGHT_SHIFT(tmp26 - tmp12, CONST_BITS-PASS1_BITS); + wsptr[8*7] = (int) RIGHT_SHIFT(tmp27 + tmp13, CONST_BITS-PASS1_BITS); + wsptr[8*8] = (int) RIGHT_SHIFT(tmp27 - tmp13, CONST_BITS-PASS1_BITS); + } + + /* Pass 2: process 16 rows from work array, store into output array. */ + + wsptr = workspace; + for (ctr = 0; ctr < 16; ctr++) { + outptr = output_buf[ctr] + output_col; + + /* Even part */ + + /* Add fudge factor here for final descale. */ + tmp0 = (JLONG) wsptr[0] + (ONE << (PASS1_BITS+2)); + tmp0 = LEFT_SHIFT(tmp0, CONST_BITS); + + z1 = (JLONG) wsptr[4]; + tmp1 = MULTIPLY(z1, FIX(1.306562965)); /* c4[16] = c2[8] */ + tmp2 = MULTIPLY(z1, FIX_0_541196100); /* c12[16] = c6[8] */ + + tmp10 = tmp0 + tmp1; + tmp11 = tmp0 - tmp1; + tmp12 = tmp0 + tmp2; + tmp13 = tmp0 - tmp2; + + z1 = (JLONG) wsptr[2]; + z2 = (JLONG) wsptr[6]; + z3 = z1 - z2; + z4 = MULTIPLY(z3, FIX(0.275899379)); /* c14[16] = c7[8] */ + z3 = MULTIPLY(z3, FIX(1.387039845)); /* c2[16] = c1[8] */ + + tmp0 = z3 + MULTIPLY(z2, FIX_2_562915447); /* (c6+c2)[16] = (c3+c1)[8] */ + tmp1 = z4 + MULTIPLY(z1, FIX_0_899976223); /* (c6-c14)[16] = (c3-c7)[8] */ + tmp2 = z3 - MULTIPLY(z1, FIX(0.601344887)); /* (c2-c10)[16] = (c1-c5)[8] */ + tmp3 = z4 - MULTIPLY(z2, FIX(0.509795579)); /* (c10-c14)[16] = (c5-c7)[8] */ + + tmp20 = tmp10 + tmp0; + tmp27 = tmp10 - tmp0; + tmp21 = tmp12 + tmp1; + tmp26 = tmp12 - tmp1; + tmp22 = tmp13 + tmp2; + tmp25 = tmp13 - tmp2; + tmp23 = tmp11 + tmp3; + tmp24 = tmp11 - tmp3; + + /* Odd part */ + + z1 = (JLONG) wsptr[1]; + z2 = (JLONG) wsptr[3]; + z3 = (JLONG) wsptr[5]; + z4 = (JLONG) wsptr[7]; + + tmp11 = z1 + z3; + + tmp1 = MULTIPLY(z1 + z2, FIX(1.353318001)); /* c3 */ + tmp2 = MULTIPLY(tmp11, FIX(1.247225013)); /* c5 */ + tmp3 = MULTIPLY(z1 + z4, FIX(1.093201867)); /* c7 */ + tmp10 = MULTIPLY(z1 - z4, FIX(0.897167586)); /* c9 */ + tmp11 = MULTIPLY(tmp11, FIX(0.666655658)); /* c11 */ + tmp12 = MULTIPLY(z1 - z2, FIX(0.410524528)); /* c13 */ + tmp0 = tmp1 + tmp2 + tmp3 - + MULTIPLY(z1, FIX(2.286341144)); /* c7+c5+c3-c1 */ + tmp13 = tmp10 + tmp11 + tmp12 - + MULTIPLY(z1, FIX(1.835730603)); /* c9+c11+c13-c15 */ + z1 = MULTIPLY(z2 + z3, FIX(0.138617169)); /* c15 */ + tmp1 += z1 + MULTIPLY(z2, FIX(0.071888074)); /* c9+c11-c3-c15 */ + tmp2 += z1 - MULTIPLY(z3, FIX(1.125726048)); /* c5+c7+c15-c3 */ + z1 = MULTIPLY(z3 - z2, FIX(1.407403738)); /* c1 */ + tmp11 += z1 - MULTIPLY(z3, FIX(0.766367282)); /* c1+c11-c9-c13 */ + tmp12 += z1 + MULTIPLY(z2, FIX(1.971951411)); /* c1+c5+c13-c7 */ + z2 += z4; + z1 = MULTIPLY(z2, - FIX(0.666655658)); /* -c11 */ + tmp1 += z1; + tmp3 += z1 + MULTIPLY(z4, FIX(1.065388962)); /* c3+c11+c15-c7 */ + z2 = MULTIPLY(z2, - FIX(1.247225013)); /* -c5 */ + tmp10 += z2 + MULTIPLY(z4, FIX(3.141271809)); /* c1+c5+c9-c13 */ + tmp12 += z2; + z2 = MULTIPLY(z3 + z4, - FIX(1.353318001)); /* -c3 */ + tmp2 += z2; + tmp3 += z2; + z2 = MULTIPLY(z4 - z3, FIX(0.410524528)); /* c13 */ + tmp10 += z2; + tmp11 += z2; + + /* Final output stage */ + + outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp20 + tmp0, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[15] = range_limit[(int) RIGHT_SHIFT(tmp20 - tmp0, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp21 + tmp1, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[14] = range_limit[(int) RIGHT_SHIFT(tmp21 - tmp1, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp22 + tmp2, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[13] = range_limit[(int) RIGHT_SHIFT(tmp22 - tmp2, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[3] = range_limit[(int) RIGHT_SHIFT(tmp23 + tmp3, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[12] = range_limit[(int) RIGHT_SHIFT(tmp23 - tmp3, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[4] = range_limit[(int) RIGHT_SHIFT(tmp24 + tmp10, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[11] = range_limit[(int) RIGHT_SHIFT(tmp24 - tmp10, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[5] = range_limit[(int) RIGHT_SHIFT(tmp25 + tmp11, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[10] = range_limit[(int) RIGHT_SHIFT(tmp25 - tmp11, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[6] = range_limit[(int) RIGHT_SHIFT(tmp26 + tmp12, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[9] = range_limit[(int) RIGHT_SHIFT(tmp26 - tmp12, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[7] = range_limit[(int) RIGHT_SHIFT(tmp27 + tmp13, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[8] = range_limit[(int) RIGHT_SHIFT(tmp27 - tmp13, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + + wsptr += 8; /* advance pointer to next row */ + } +} + +#endif /* IDCT_SCALING_SUPPORTED */ +#endif /* DCT_ISLOW_SUPPORTED */ diff --git a/media/libjpeg/jidctred.c b/media/libjpeg/jidctred.c new file mode 100644 index 000000000..7a81803b8 --- /dev/null +++ b/media/libjpeg/jidctred.c @@ -0,0 +1,403 @@ +/* + * jidctred.c + * + * This file was part of the Independent JPEG Group's software. + * Copyright (C) 1994-1998, Thomas G. Lane. + * libjpeg-turbo Modifications: + * Copyright (C) 2015, D. R. Commander. + * For conditions of distribution and use, see the accompanying README.ijg + * file. + * + * This file contains inverse-DCT routines that produce reduced-size output: + * either 4x4, 2x2, or 1x1 pixels from an 8x8 DCT block. + * + * The implementation is based on the Loeffler, Ligtenberg and Moschytz (LL&M) + * algorithm used in jidctint.c. We simply replace each 8-to-8 1-D IDCT step + * with an 8-to-4 step that produces the four averages of two adjacent outputs + * (or an 8-to-2 step producing two averages of four outputs, for 2x2 output). + * These steps were derived by computing the corresponding values at the end + * of the normal LL&M code, then simplifying as much as possible. + * + * 1x1 is trivial: just take the DC coefficient divided by 8. + * + * See jidctint.c for additional comments. + */ + +#define JPEG_INTERNALS +#include "jinclude.h" +#include "jpeglib.h" +#include "jdct.h" /* Private declarations for DCT subsystem */ + +#ifdef IDCT_SCALING_SUPPORTED + + +/* + * This module is specialized to the case DCTSIZE = 8. + */ + +#if DCTSIZE != 8 + Sorry, this code only copes with 8x8 DCTs. /* deliberate syntax err */ +#endif + + +/* Scaling is the same as in jidctint.c. */ + +#if BITS_IN_JSAMPLE == 8 +#define CONST_BITS 13 +#define PASS1_BITS 2 +#else +#define CONST_BITS 13 +#define PASS1_BITS 1 /* lose a little precision to avoid overflow */ +#endif + +/* Some C compilers fail to reduce "FIX(constant)" at compile time, thus + * causing a lot of useless floating-point operations at run time. + * To get around this we use the following pre-calculated constants. + * If you change CONST_BITS you may want to add appropriate values. + * (With a reasonable C compiler, you can just rely on the FIX() macro...) + */ + +#if CONST_BITS == 13 +#define FIX_0_211164243 ((JLONG) 1730) /* FIX(0.211164243) */ +#define FIX_0_509795579 ((JLONG) 4176) /* FIX(0.509795579) */ +#define FIX_0_601344887 ((JLONG) 4926) /* FIX(0.601344887) */ +#define FIX_0_720959822 ((JLONG) 5906) /* FIX(0.720959822) */ +#define FIX_0_765366865 ((JLONG) 6270) /* FIX(0.765366865) */ +#define FIX_0_850430095 ((JLONG) 6967) /* FIX(0.850430095) */ +#define FIX_0_899976223 ((JLONG) 7373) /* FIX(0.899976223) */ +#define FIX_1_061594337 ((JLONG) 8697) /* FIX(1.061594337) */ +#define FIX_1_272758580 ((JLONG) 10426) /* FIX(1.272758580) */ +#define FIX_1_451774981 ((JLONG) 11893) /* FIX(1.451774981) */ +#define FIX_1_847759065 ((JLONG) 15137) /* FIX(1.847759065) */ +#define FIX_2_172734803 ((JLONG) 17799) /* FIX(2.172734803) */ +#define FIX_2_562915447 ((JLONG) 20995) /* FIX(2.562915447) */ +#define FIX_3_624509785 ((JLONG) 29692) /* FIX(3.624509785) */ +#else +#define FIX_0_211164243 FIX(0.211164243) +#define FIX_0_509795579 FIX(0.509795579) +#define FIX_0_601344887 FIX(0.601344887) +#define FIX_0_720959822 FIX(0.720959822) +#define FIX_0_765366865 FIX(0.765366865) +#define FIX_0_850430095 FIX(0.850430095) +#define FIX_0_899976223 FIX(0.899976223) +#define FIX_1_061594337 FIX(1.061594337) +#define FIX_1_272758580 FIX(1.272758580) +#define FIX_1_451774981 FIX(1.451774981) +#define FIX_1_847759065 FIX(1.847759065) +#define FIX_2_172734803 FIX(2.172734803) +#define FIX_2_562915447 FIX(2.562915447) +#define FIX_3_624509785 FIX(3.624509785) +#endif + + +/* Multiply a JLONG variable by a JLONG constant to yield a JLONG result. + * For 8-bit samples with the recommended scaling, all the variable + * and constant values involved are no more than 16 bits wide, so a + * 16x16->32 bit multiply can be used instead of a full 32x32 multiply. + * For 12-bit samples, a full 32-bit multiplication will be needed. + */ + +#if BITS_IN_JSAMPLE == 8 +#define MULTIPLY(var,const) MULTIPLY16C16(var,const) +#else +#define MULTIPLY(var,const) ((var) * (const)) +#endif + + +/* Dequantize a coefficient by multiplying it by the multiplier-table + * entry; produce an int result. In this module, both inputs and result + * are 16 bits or less, so either int or short multiply will work. + */ + +#define DEQUANTIZE(coef,quantval) (((ISLOW_MULT_TYPE) (coef)) * (quantval)) + + +/* + * Perform dequantization and inverse DCT on one block of coefficients, + * producing a reduced-size 4x4 output block. + */ + +GLOBAL(void) +jpeg_idct_4x4 (j_decompress_ptr cinfo, jpeg_component_info *compptr, + JCOEFPTR coef_block, + JSAMPARRAY output_buf, JDIMENSION output_col) +{ + JLONG tmp0, tmp2, tmp10, tmp12; + JLONG z1, z2, z3, z4; + JCOEFPTR inptr; + ISLOW_MULT_TYPE *quantptr; + int *wsptr; + JSAMPROW outptr; + JSAMPLE *range_limit = IDCT_range_limit(cinfo); + int ctr; + int workspace[DCTSIZE*4]; /* buffers data between passes */ + SHIFT_TEMPS + + /* Pass 1: process columns from input, store into work array. */ + + inptr = coef_block; + quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table; + wsptr = workspace; + for (ctr = DCTSIZE; ctr > 0; inptr++, quantptr++, wsptr++, ctr--) { + /* Don't bother to process column 4, because second pass won't use it */ + if (ctr == DCTSIZE-4) + continue; + if (inptr[DCTSIZE*1] == 0 && inptr[DCTSIZE*2] == 0 && + inptr[DCTSIZE*3] == 0 && inptr[DCTSIZE*5] == 0 && + inptr[DCTSIZE*6] == 0 && inptr[DCTSIZE*7] == 0) { + /* AC terms all zero; we need not examine term 4 for 4x4 output */ + int dcval = LEFT_SHIFT(DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]), + PASS1_BITS); + + wsptr[DCTSIZE*0] = dcval; + wsptr[DCTSIZE*1] = dcval; + wsptr[DCTSIZE*2] = dcval; + wsptr[DCTSIZE*3] = dcval; + + continue; + } + + /* Even part */ + + tmp0 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]); + tmp0 = LEFT_SHIFT(tmp0, CONST_BITS+1); + + z2 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]); + z3 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]); + + tmp2 = MULTIPLY(z2, FIX_1_847759065) + MULTIPLY(z3, - FIX_0_765366865); + + tmp10 = tmp0 + tmp2; + tmp12 = tmp0 - tmp2; + + /* Odd part */ + + z1 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]); + z2 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]); + z3 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]); + z4 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]); + + tmp0 = MULTIPLY(z1, - FIX_0_211164243) /* sqrt(2) * (c3-c1) */ + + MULTIPLY(z2, FIX_1_451774981) /* sqrt(2) * (c3+c7) */ + + MULTIPLY(z3, - FIX_2_172734803) /* sqrt(2) * (-c1-c5) */ + + MULTIPLY(z4, FIX_1_061594337); /* sqrt(2) * (c5+c7) */ + + tmp2 = MULTIPLY(z1, - FIX_0_509795579) /* sqrt(2) * (c7-c5) */ + + MULTIPLY(z2, - FIX_0_601344887) /* sqrt(2) * (c5-c1) */ + + MULTIPLY(z3, FIX_0_899976223) /* sqrt(2) * (c3-c7) */ + + MULTIPLY(z4, FIX_2_562915447); /* sqrt(2) * (c1+c3) */ + + /* Final output stage */ + + wsptr[DCTSIZE*0] = (int) DESCALE(tmp10 + tmp2, CONST_BITS-PASS1_BITS+1); + wsptr[DCTSIZE*3] = (int) DESCALE(tmp10 - tmp2, CONST_BITS-PASS1_BITS+1); + wsptr[DCTSIZE*1] = (int) DESCALE(tmp12 + tmp0, CONST_BITS-PASS1_BITS+1); + wsptr[DCTSIZE*2] = (int) DESCALE(tmp12 - tmp0, CONST_BITS-PASS1_BITS+1); + } + + /* Pass 2: process 4 rows from work array, store into output array. */ + + wsptr = workspace; + for (ctr = 0; ctr < 4; ctr++) { + outptr = output_buf[ctr] + output_col; + /* It's not clear whether a zero row test is worthwhile here ... */ + +#ifndef NO_ZERO_ROW_TEST + if (wsptr[1] == 0 && wsptr[2] == 0 && wsptr[3] == 0 && + wsptr[5] == 0 && wsptr[6] == 0 && wsptr[7] == 0) { + /* AC terms all zero */ + JSAMPLE dcval = range_limit[(int) DESCALE((JLONG) wsptr[0], PASS1_BITS+3) + & RANGE_MASK]; + + outptr[0] = dcval; + outptr[1] = dcval; + outptr[2] = dcval; + outptr[3] = dcval; + + wsptr += DCTSIZE; /* advance pointer to next row */ + continue; + } +#endif + + /* Even part */ + + tmp0 = LEFT_SHIFT((JLONG) wsptr[0], CONST_BITS+1); + + tmp2 = MULTIPLY((JLONG) wsptr[2], FIX_1_847759065) + + MULTIPLY((JLONG) wsptr[6], - FIX_0_765366865); + + tmp10 = tmp0 + tmp2; + tmp12 = tmp0 - tmp2; + + /* Odd part */ + + z1 = (JLONG) wsptr[7]; + z2 = (JLONG) wsptr[5]; + z3 = (JLONG) wsptr[3]; + z4 = (JLONG) wsptr[1]; + + tmp0 = MULTIPLY(z1, - FIX_0_211164243) /* sqrt(2) * (c3-c1) */ + + MULTIPLY(z2, FIX_1_451774981) /* sqrt(2) * (c3+c7) */ + + MULTIPLY(z3, - FIX_2_172734803) /* sqrt(2) * (-c1-c5) */ + + MULTIPLY(z4, FIX_1_061594337); /* sqrt(2) * (c5+c7) */ + + tmp2 = MULTIPLY(z1, - FIX_0_509795579) /* sqrt(2) * (c7-c5) */ + + MULTIPLY(z2, - FIX_0_601344887) /* sqrt(2) * (c5-c1) */ + + MULTIPLY(z3, FIX_0_899976223) /* sqrt(2) * (c3-c7) */ + + MULTIPLY(z4, FIX_2_562915447); /* sqrt(2) * (c1+c3) */ + + /* Final output stage */ + + outptr[0] = range_limit[(int) DESCALE(tmp10 + tmp2, + CONST_BITS+PASS1_BITS+3+1) + & RANGE_MASK]; + outptr[3] = range_limit[(int) DESCALE(tmp10 - tmp2, + CONST_BITS+PASS1_BITS+3+1) + & RANGE_MASK]; + outptr[1] = range_limit[(int) DESCALE(tmp12 + tmp0, + CONST_BITS+PASS1_BITS+3+1) + & RANGE_MASK]; + outptr[2] = range_limit[(int) DESCALE(tmp12 - tmp0, + CONST_BITS+PASS1_BITS+3+1) + & RANGE_MASK]; + + wsptr += DCTSIZE; /* advance pointer to next row */ + } +} + + +/* + * Perform dequantization and inverse DCT on one block of coefficients, + * producing a reduced-size 2x2 output block. + */ + +GLOBAL(void) +jpeg_idct_2x2 (j_decompress_ptr cinfo, jpeg_component_info *compptr, + JCOEFPTR coef_block, + JSAMPARRAY output_buf, JDIMENSION output_col) +{ + JLONG tmp0, tmp10, z1; + JCOEFPTR inptr; + ISLOW_MULT_TYPE *quantptr; + int *wsptr; + JSAMPROW outptr; + JSAMPLE *range_limit = IDCT_range_limit(cinfo); + int ctr; + int workspace[DCTSIZE*2]; /* buffers data between passes */ + SHIFT_TEMPS + + /* Pass 1: process columns from input, store into work array. */ + + inptr = coef_block; + quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table; + wsptr = workspace; + for (ctr = DCTSIZE; ctr > 0; inptr++, quantptr++, wsptr++, ctr--) { + /* Don't bother to process columns 2,4,6 */ + if (ctr == DCTSIZE-2 || ctr == DCTSIZE-4 || ctr == DCTSIZE-6) + continue; + if (inptr[DCTSIZE*1] == 0 && inptr[DCTSIZE*3] == 0 && + inptr[DCTSIZE*5] == 0 && inptr[DCTSIZE*7] == 0) { + /* AC terms all zero; we need not examine terms 2,4,6 for 2x2 output */ + int dcval = LEFT_SHIFT(DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]), + PASS1_BITS); + + wsptr[DCTSIZE*0] = dcval; + wsptr[DCTSIZE*1] = dcval; + + continue; + } + + /* Even part */ + + z1 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]); + tmp10 = LEFT_SHIFT(z1, CONST_BITS+2); + + /* Odd part */ + + z1 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]); + tmp0 = MULTIPLY(z1, - FIX_0_720959822); /* sqrt(2) * (c7-c5+c3-c1) */ + z1 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]); + tmp0 += MULTIPLY(z1, FIX_0_850430095); /* sqrt(2) * (-c1+c3+c5+c7) */ + z1 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]); + tmp0 += MULTIPLY(z1, - FIX_1_272758580); /* sqrt(2) * (-c1+c3-c5-c7) */ + z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]); + tmp0 += MULTIPLY(z1, FIX_3_624509785); /* sqrt(2) * (c1+c3+c5+c7) */ + + /* Final output stage */ + + wsptr[DCTSIZE*0] = (int) DESCALE(tmp10 + tmp0, CONST_BITS-PASS1_BITS+2); + wsptr[DCTSIZE*1] = (int) DESCALE(tmp10 - tmp0, CONST_BITS-PASS1_BITS+2); + } + + /* Pass 2: process 2 rows from work array, store into output array. */ + + wsptr = workspace; + for (ctr = 0; ctr < 2; ctr++) { + outptr = output_buf[ctr] + output_col; + /* It's not clear whether a zero row test is worthwhile here ... */ + +#ifndef NO_ZERO_ROW_TEST + if (wsptr[1] == 0 && wsptr[3] == 0 && wsptr[5] == 0 && wsptr[7] == 0) { + /* AC terms all zero */ + JSAMPLE dcval = range_limit[(int) DESCALE((JLONG) wsptr[0], PASS1_BITS+3) + & RANGE_MASK]; + + outptr[0] = dcval; + outptr[1] = dcval; + + wsptr += DCTSIZE; /* advance pointer to next row */ + continue; + } +#endif + + /* Even part */ + + tmp10 = LEFT_SHIFT((JLONG) wsptr[0], CONST_BITS+2); + + /* Odd part */ + + tmp0 = MULTIPLY((JLONG) wsptr[7], - FIX_0_720959822) /* sqrt(2) * (c7-c5+c3-c1) */ + + MULTIPLY((JLONG) wsptr[5], FIX_0_850430095) /* sqrt(2) * (-c1+c3+c5+c7) */ + + MULTIPLY((JLONG) wsptr[3], - FIX_1_272758580) /* sqrt(2) * (-c1+c3-c5-c7) */ + + MULTIPLY((JLONG) wsptr[1], FIX_3_624509785); /* sqrt(2) * (c1+c3+c5+c7) */ + + /* Final output stage */ + + outptr[0] = range_limit[(int) DESCALE(tmp10 + tmp0, + CONST_BITS+PASS1_BITS+3+2) + & RANGE_MASK]; + outptr[1] = range_limit[(int) DESCALE(tmp10 - tmp0, + CONST_BITS+PASS1_BITS+3+2) + & RANGE_MASK]; + + wsptr += DCTSIZE; /* advance pointer to next row */ + } +} + + +/* + * Perform dequantization and inverse DCT on one block of coefficients, + * producing a reduced-size 1x1 output block. + */ + +GLOBAL(void) +jpeg_idct_1x1 (j_decompress_ptr cinfo, jpeg_component_info *compptr, + JCOEFPTR coef_block, + JSAMPARRAY output_buf, JDIMENSION output_col) +{ + int dcval; + ISLOW_MULT_TYPE *quantptr; + JSAMPLE *range_limit = IDCT_range_limit(cinfo); + SHIFT_TEMPS + + /* We hardly need an inverse DCT routine for this: just take the + * average pixel value, which is one-eighth of the DC coefficient. + */ + quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table; + dcval = DEQUANTIZE(coef_block[0], quantptr[0]); + dcval = (int) DESCALE((JLONG) dcval, 3); + + output_buf[0][output_col] = range_limit[dcval & RANGE_MASK]; +} + +#endif /* IDCT_SCALING_SUPPORTED */ diff --git a/media/libjpeg/jinclude.h b/media/libjpeg/jinclude.h new file mode 100644 index 000000000..d461a1aa1 --- /dev/null +++ b/media/libjpeg/jinclude.h @@ -0,0 +1,84 @@ +/* + * jinclude.h + * + * This file was part of the Independent JPEG Group's software: + * Copyright (C) 1991-1994, Thomas G. Lane. + * It was modified by The libjpeg-turbo Project to include only code relevant + * to libjpeg-turbo. + * For conditions of distribution and use, see the accompanying README.ijg + * file. + * + * This file exists to provide a single place to fix any problems with + * including the wrong system include files. (Common problems are taken + * care of by the standard jconfig symbols, but on really weird systems + * you may have to edit this file.) + * + * NOTE: this file is NOT intended to be included by applications using the + * JPEG library. Most applications need only include jpeglib.h. + */ + + +/* Include auto-config file to find out which system include files we need. */ + +#include "jconfig.h" /* auto configuration options */ +#define JCONFIG_INCLUDED /* so that jpeglib.h doesn't do it again */ + +/* + * We need the NULL macro and size_t typedef. + * On an ANSI-conforming system it is sufficient to include <stddef.h>. + * Otherwise, we get them from <stdlib.h> or <stdio.h>; we may have to + * pull in <sys/types.h> as well. + * Note that the core JPEG library does not require <stdio.h>; + * only the default error handler and data source/destination modules do. + * But we must pull it in because of the references to FILE in jpeglib.h. + * You can remove those references if you want to compile without <stdio.h>. + */ + +#ifdef HAVE_STDDEF_H +#include <stddef.h> +#endif + +#ifdef HAVE_STDLIB_H +#include <stdlib.h> +#endif + +#ifdef NEED_SYS_TYPES_H +#include <sys/types.h> +#endif + +#include <stdio.h> + +/* + * We need memory copying and zeroing functions, plus strncpy(). + * ANSI and System V implementations declare these in <string.h>. + * BSD doesn't have the mem() functions, but it does have bcopy()/bzero(). + * Some systems may declare memset and memcpy in <memory.h>. + * + * NOTE: we assume the size parameters to these functions are of type size_t. + * Change the casts in these macros if not! + */ + +#ifdef NEED_BSD_STRINGS + +#include <strings.h> +#define MEMZERO(target,size) bzero((void *)(target), (size_t)(size)) +#define MEMCOPY(dest,src,size) bcopy((const void *)(src), (void *)(dest), (size_t)(size)) + +#else /* not BSD, assume ANSI/SysV string lib */ + +#include <string.h> +#define MEMZERO(target,size) memset((void *)(target), 0, (size_t)(size)) +#define MEMCOPY(dest,src,size) memcpy((void *)(dest), (const void *)(src), (size_t)(size)) + +#endif + +/* + * The modules that use fread() and fwrite() always invoke them through + * these macros. On some systems you may need to twiddle the argument casts. + * CAUTION: argument order is different from underlying functions! + */ + +#define JFREAD(file,buf,sizeofbuf) \ + ((size_t) fread((void *) (buf), (size_t) 1, (size_t) (sizeofbuf), (file))) +#define JFWRITE(file,buf,sizeofbuf) \ + ((size_t) fwrite((const void *) (buf), (size_t) 1, (size_t) (sizeofbuf), (file))) diff --git a/media/libjpeg/jmemmgr.c b/media/libjpeg/jmemmgr.c new file mode 100644 index 000000000..2a8d8401f --- /dev/null +++ b/media/libjpeg/jmemmgr.c @@ -0,0 +1,1181 @@ +/* + * jmemmgr.c + * + * This file was part of the Independent JPEG Group's software: + * Copyright (C) 1991-1997, Thomas G. Lane. + * libjpeg-turbo Modifications: + * Copyright (C) 2016, D. R. Commander. + * For conditions of distribution and use, see the accompanying README.ijg + * file. + * + * This file contains the JPEG system-independent memory management + * routines. This code is usable across a wide variety of machines; most + * of the system dependencies have been isolated in a separate file. + * The major functions provided here are: + * * pool-based allocation and freeing of memory; + * * policy decisions about how to divide available memory among the + * virtual arrays; + * * control logic for swapping virtual arrays between main memory and + * backing storage. + * The separate system-dependent file provides the actual backing-storage + * access code, and it contains the policy decision about how much total + * main memory to use. + * This file is system-dependent in the sense that some of its functions + * are unnecessary in some systems. For example, if there is enough virtual + * memory so that backing storage will never be used, much of the virtual + * array control logic could be removed. (Of course, if you have that much + * memory then you shouldn't care about a little bit of unused code...) + */ + +#define JPEG_INTERNALS +#define AM_MEMORY_MANAGER /* we define jvirt_Xarray_control structs */ +#include "jinclude.h" +#include "jpeglib.h" +#include "jmemsys.h" /* import the system-dependent declarations */ +#include <stdint.h> +#include <limits.h> /* some NDKs define SIZE_MAX in limits.h */ + +#ifndef NO_GETENV +#ifndef HAVE_STDLIB_H /* <stdlib.h> should declare getenv() */ +extern char *getenv (const char *name); +#endif +#endif + + +LOCAL(size_t) +round_up_pow2 (size_t a, size_t b) +/* a rounded up to the next multiple of b, i.e. ceil(a/b)*b */ +/* Assumes a >= 0, b > 0, and b is a power of 2 */ +{ + return ((a + b - 1) & (~(b - 1))); +} + + +/* + * Some important notes: + * The allocation routines provided here must never return NULL. + * They should exit to error_exit if unsuccessful. + * + * It's not a good idea to try to merge the sarray and barray routines, + * even though they are textually almost the same, because samples are + * usually stored as bytes while coefficients are shorts or ints. Thus, + * in machines where byte pointers have a different representation from + * word pointers, the resulting machine code could not be the same. + */ + + +/* + * Many machines require storage alignment: longs must start on 4-byte + * boundaries, doubles on 8-byte boundaries, etc. On such machines, malloc() + * always returns pointers that are multiples of the worst-case alignment + * requirement, and we had better do so too. + * There isn't any really portable way to determine the worst-case alignment + * requirement. This module assumes that the alignment requirement is + * multiples of ALIGN_SIZE. + * By default, we define ALIGN_SIZE as sizeof(double). This is necessary on + * some workstations (where doubles really do need 8-byte alignment) and will + * work fine on nearly everything. If your machine has lesser alignment needs, + * you can save a few bytes by making ALIGN_SIZE smaller. + * The only place I know of where this will NOT work is certain Macintosh + * 680x0 compilers that define double as a 10-byte IEEE extended float. + * Doing 10-byte alignment is counterproductive because longwords won't be + * aligned well. Put "#define ALIGN_SIZE 4" in jconfig.h if you have + * such a compiler. + */ + +#ifndef ALIGN_SIZE /* so can override from jconfig.h */ +#ifndef WITH_SIMD +#define ALIGN_SIZE sizeof(double) +#else +#define ALIGN_SIZE 16 /* Most SIMD implementations require this */ +#endif +#endif + +/* + * We allocate objects from "pools", where each pool is gotten with a single + * request to jpeg_get_small() or jpeg_get_large(). There is no per-object + * overhead within a pool, except for alignment padding. Each pool has a + * header with a link to the next pool of the same class. + * Small and large pool headers are identical. + */ + +typedef struct small_pool_struct *small_pool_ptr; + +typedef struct small_pool_struct { + small_pool_ptr next; /* next in list of pools */ + size_t bytes_used; /* how many bytes already used within pool */ + size_t bytes_left; /* bytes still available in this pool */ +} small_pool_hdr; + +typedef struct large_pool_struct *large_pool_ptr; + +typedef struct large_pool_struct { + large_pool_ptr next; /* next in list of pools */ + size_t bytes_used; /* how many bytes already used within pool */ + size_t bytes_left; /* bytes still available in this pool */ +} large_pool_hdr; + +/* + * Here is the full definition of a memory manager object. + */ + +typedef struct { + struct jpeg_memory_mgr pub; /* public fields */ + + /* Each pool identifier (lifetime class) names a linked list of pools. */ + small_pool_ptr small_list[JPOOL_NUMPOOLS]; + large_pool_ptr large_list[JPOOL_NUMPOOLS]; + + /* Since we only have one lifetime class of virtual arrays, only one + * linked list is necessary (for each datatype). Note that the virtual + * array control blocks being linked together are actually stored somewhere + * in the small-pool list. + */ + jvirt_sarray_ptr virt_sarray_list; + jvirt_barray_ptr virt_barray_list; + + /* This counts total space obtained from jpeg_get_small/large */ + size_t total_space_allocated; + + /* alloc_sarray and alloc_barray set this value for use by virtual + * array routines. + */ + JDIMENSION last_rowsperchunk; /* from most recent alloc_sarray/barray */ +} my_memory_mgr; + +typedef my_memory_mgr *my_mem_ptr; + + +/* + * The control blocks for virtual arrays. + * Note that these blocks are allocated in the "small" pool area. + * System-dependent info for the associated backing store (if any) is hidden + * inside the backing_store_info struct. + */ + +struct jvirt_sarray_control { + JSAMPARRAY mem_buffer; /* => the in-memory buffer */ + JDIMENSION rows_in_array; /* total virtual array height */ + JDIMENSION samplesperrow; /* width of array (and of memory buffer) */ + JDIMENSION maxaccess; /* max rows accessed by access_virt_sarray */ + JDIMENSION rows_in_mem; /* height of memory buffer */ + JDIMENSION rowsperchunk; /* allocation chunk size in mem_buffer */ + JDIMENSION cur_start_row; /* first logical row # in the buffer */ + JDIMENSION first_undef_row; /* row # of first uninitialized row */ + boolean pre_zero; /* pre-zero mode requested? */ + boolean dirty; /* do current buffer contents need written? */ + boolean b_s_open; /* is backing-store data valid? */ + jvirt_sarray_ptr next; /* link to next virtual sarray control block */ + backing_store_info b_s_info; /* System-dependent control info */ +}; + +struct jvirt_barray_control { + JBLOCKARRAY mem_buffer; /* => the in-memory buffer */ + JDIMENSION rows_in_array; /* total virtual array height */ + JDIMENSION blocksperrow; /* width of array (and of memory buffer) */ + JDIMENSION maxaccess; /* max rows accessed by access_virt_barray */ + JDIMENSION rows_in_mem; /* height of memory buffer */ + JDIMENSION rowsperchunk; /* allocation chunk size in mem_buffer */ + JDIMENSION cur_start_row; /* first logical row # in the buffer */ + JDIMENSION first_undef_row; /* row # of first uninitialized row */ + boolean pre_zero; /* pre-zero mode requested? */ + boolean dirty; /* do current buffer contents need written? */ + boolean b_s_open; /* is backing-store data valid? */ + jvirt_barray_ptr next; /* link to next virtual barray control block */ + backing_store_info b_s_info; /* System-dependent control info */ +}; + + +#ifdef MEM_STATS /* optional extra stuff for statistics */ + +LOCAL(void) +print_mem_stats (j_common_ptr cinfo, int pool_id) +{ + my_mem_ptr mem = (my_mem_ptr) cinfo->mem; + small_pool_ptr shdr_ptr; + large_pool_ptr lhdr_ptr; + + /* Since this is only a debugging stub, we can cheat a little by using + * fprintf directly rather than going through the trace message code. + * This is helpful because message parm array can't handle longs. + */ + fprintf(stderr, "Freeing pool %d, total space = %ld\n", + pool_id, mem->total_space_allocated); + + for (lhdr_ptr = mem->large_list[pool_id]; lhdr_ptr != NULL; + lhdr_ptr = lhdr_ptr->next) { + fprintf(stderr, " Large chunk used %ld\n", + (long) lhdr_ptr->bytes_used); + } + + for (shdr_ptr = mem->small_list[pool_id]; shdr_ptr != NULL; + shdr_ptr = shdr_ptr->next) { + fprintf(stderr, " Small chunk used %ld free %ld\n", + (long) shdr_ptr->bytes_used, + (long) shdr_ptr->bytes_left); + } +} + +#endif /* MEM_STATS */ + + +LOCAL(void) +out_of_memory (j_common_ptr cinfo, int which) +/* Report an out-of-memory error and stop execution */ +/* If we compiled MEM_STATS support, report alloc requests before dying */ +{ +#ifdef MEM_STATS + cinfo->err->trace_level = 2; /* force self_destruct to report stats */ +#endif + ERREXIT1(cinfo, JERR_OUT_OF_MEMORY, which); +} + + +/* + * Allocation of "small" objects. + * + * For these, we use pooled storage. When a new pool must be created, + * we try to get enough space for the current request plus a "slop" factor, + * where the slop will be the amount of leftover space in the new pool. + * The speed vs. space tradeoff is largely determined by the slop values. + * A different slop value is provided for each pool class (lifetime), + * and we also distinguish the first pool of a class from later ones. + * NOTE: the values given work fairly well on both 16- and 32-bit-int + * machines, but may be too small if longs are 64 bits or more. + * + * Since we do not know what alignment malloc() gives us, we have to + * allocate ALIGN_SIZE-1 extra space per pool to have room for alignment + * adjustment. + */ + +static const size_t first_pool_slop[JPOOL_NUMPOOLS] = +{ + 1600, /* first PERMANENT pool */ + 16000 /* first IMAGE pool */ +}; + +static const size_t extra_pool_slop[JPOOL_NUMPOOLS] = +{ + 0, /* additional PERMANENT pools */ + 5000 /* additional IMAGE pools */ +}; + +#define MIN_SLOP 50 /* greater than 0 to avoid futile looping */ + + +METHODDEF(void *) +alloc_small (j_common_ptr cinfo, int pool_id, size_t sizeofobject) +/* Allocate a "small" object */ +{ + my_mem_ptr mem = (my_mem_ptr) cinfo->mem; + small_pool_ptr hdr_ptr, prev_hdr_ptr; + char *data_ptr; + size_t min_request, slop; + + /* + * Round up the requested size to a multiple of ALIGN_SIZE in order + * to assure alignment for the next object allocated in the same pool + * and so that algorithms can straddle outside the proper area up + * to the next alignment. + */ + if (sizeofobject > MAX_ALLOC_CHUNK) { + /* This prevents overflow/wrap-around in round_up_pow2() if sizeofobject + is close to SIZE_MAX. */ + out_of_memory(cinfo, 7); + } + sizeofobject = round_up_pow2(sizeofobject, ALIGN_SIZE); + + /* Check for unsatisfiable request (do now to ensure no overflow below) */ + if ((sizeof(small_pool_hdr) + sizeofobject + ALIGN_SIZE - 1) > + MAX_ALLOC_CHUNK) + out_of_memory(cinfo, 1); /* request exceeds malloc's ability */ + + /* See if space is available in any existing pool */ + if (pool_id < 0 || pool_id >= JPOOL_NUMPOOLS) + ERREXIT1(cinfo, JERR_BAD_POOL_ID, pool_id); /* safety check */ + prev_hdr_ptr = NULL; + hdr_ptr = mem->small_list[pool_id]; + while (hdr_ptr != NULL) { + if (hdr_ptr->bytes_left >= sizeofobject) + break; /* found pool with enough space */ + prev_hdr_ptr = hdr_ptr; + hdr_ptr = hdr_ptr->next; + } + + /* Time to make a new pool? */ + if (hdr_ptr == NULL) { + /* min_request is what we need now, slop is what will be leftover */ + min_request = sizeof(small_pool_hdr) + sizeofobject + ALIGN_SIZE - 1; + if (prev_hdr_ptr == NULL) /* first pool in class? */ + slop = first_pool_slop[pool_id]; + else + slop = extra_pool_slop[pool_id]; + /* Don't ask for more than MAX_ALLOC_CHUNK */ + if (slop > (size_t) (MAX_ALLOC_CHUNK-min_request)) + slop = (size_t) (MAX_ALLOC_CHUNK-min_request); + /* Try to get space, if fail reduce slop and try again */ + for (;;) { + hdr_ptr = (small_pool_ptr) jpeg_get_small(cinfo, min_request + slop); + if (hdr_ptr != NULL) + break; + slop /= 2; + if (slop < MIN_SLOP) /* give up when it gets real small */ + out_of_memory(cinfo, 2); /* jpeg_get_small failed */ + } + mem->total_space_allocated += min_request + slop; + /* Success, initialize the new pool header and add to end of list */ + hdr_ptr->next = NULL; + hdr_ptr->bytes_used = 0; + hdr_ptr->bytes_left = sizeofobject + slop; + if (prev_hdr_ptr == NULL) /* first pool in class? */ + mem->small_list[pool_id] = hdr_ptr; + else + prev_hdr_ptr->next = hdr_ptr; + } + + /* OK, allocate the object from the current pool */ + data_ptr = (char *) hdr_ptr; /* point to first data byte in pool... */ + data_ptr += sizeof(small_pool_hdr); /* ...by skipping the header... */ + if ((size_t)data_ptr % ALIGN_SIZE) /* ...and adjust for alignment */ + data_ptr += ALIGN_SIZE - (size_t)data_ptr % ALIGN_SIZE; + data_ptr += hdr_ptr->bytes_used; /* point to place for object */ + hdr_ptr->bytes_used += sizeofobject; + hdr_ptr->bytes_left -= sizeofobject; + + return (void *) data_ptr; +} + + +/* + * Allocation of "large" objects. + * + * The external semantics of these are the same as "small" objects. However, + * the pool management heuristics are quite different. We assume that each + * request is large enough that it may as well be passed directly to + * jpeg_get_large; the pool management just links everything together + * so that we can free it all on demand. + * Note: the major use of "large" objects is in JSAMPARRAY and JBLOCKARRAY + * structures. The routines that create these structures (see below) + * deliberately bunch rows together to ensure a large request size. + */ + +METHODDEF(void *) +alloc_large (j_common_ptr cinfo, int pool_id, size_t sizeofobject) +/* Allocate a "large" object */ +{ + my_mem_ptr mem = (my_mem_ptr) cinfo->mem; + large_pool_ptr hdr_ptr; + char *data_ptr; + + /* + * Round up the requested size to a multiple of ALIGN_SIZE so that + * algorithms can straddle outside the proper area up to the next + * alignment. + */ + if (sizeofobject > MAX_ALLOC_CHUNK) { + /* This prevents overflow/wrap-around in round_up_pow2() if sizeofobject + is close to SIZE_MAX. */ + out_of_memory(cinfo, 8); + } + sizeofobject = round_up_pow2(sizeofobject, ALIGN_SIZE); + + /* Check for unsatisfiable request (do now to ensure no overflow below) */ + if ((sizeof(large_pool_hdr) + sizeofobject + ALIGN_SIZE - 1) > + MAX_ALLOC_CHUNK) + out_of_memory(cinfo, 3); /* request exceeds malloc's ability */ + + /* Always make a new pool */ + if (pool_id < 0 || pool_id >= JPOOL_NUMPOOLS) + ERREXIT1(cinfo, JERR_BAD_POOL_ID, pool_id); /* safety check */ + + hdr_ptr = (large_pool_ptr) jpeg_get_large(cinfo, sizeofobject + + sizeof(large_pool_hdr) + + ALIGN_SIZE - 1); + if (hdr_ptr == NULL) + out_of_memory(cinfo, 4); /* jpeg_get_large failed */ + mem->total_space_allocated += sizeofobject + sizeof(large_pool_hdr) + + ALIGN_SIZE - 1; + + /* Success, initialize the new pool header and add to list */ + hdr_ptr->next = mem->large_list[pool_id]; + /* We maintain space counts in each pool header for statistical purposes, + * even though they are not needed for allocation. + */ + hdr_ptr->bytes_used = sizeofobject; + hdr_ptr->bytes_left = 0; + mem->large_list[pool_id] = hdr_ptr; + + data_ptr = (char *) hdr_ptr; /* point to first data byte in pool... */ + data_ptr += sizeof(small_pool_hdr); /* ...by skipping the header... */ + if ((size_t)data_ptr % ALIGN_SIZE) /* ...and adjust for alignment */ + data_ptr += ALIGN_SIZE - (size_t)data_ptr % ALIGN_SIZE; + + return (void *) data_ptr; +} + + +/* + * Creation of 2-D sample arrays. + * + * To minimize allocation overhead and to allow I/O of large contiguous + * blocks, we allocate the sample rows in groups of as many rows as possible + * without exceeding MAX_ALLOC_CHUNK total bytes per allocation request. + * NB: the virtual array control routines, later in this file, know about + * this chunking of rows. The rowsperchunk value is left in the mem manager + * object so that it can be saved away if this sarray is the workspace for + * a virtual array. + * + * Since we are often upsampling with a factor 2, we align the size (not + * the start) to 2 * ALIGN_SIZE so that the upsampling routines don't have + * to be as careful about size. + */ + +METHODDEF(JSAMPARRAY) +alloc_sarray (j_common_ptr cinfo, int pool_id, + JDIMENSION samplesperrow, JDIMENSION numrows) +/* Allocate a 2-D sample array */ +{ + my_mem_ptr mem = (my_mem_ptr) cinfo->mem; + JSAMPARRAY result; + JSAMPROW workspace; + JDIMENSION rowsperchunk, currow, i; + long ltemp; + + /* Make sure each row is properly aligned */ + if ((ALIGN_SIZE % sizeof(JSAMPLE)) != 0) + out_of_memory(cinfo, 5); /* safety check */ + + if (samplesperrow > MAX_ALLOC_CHUNK) { + /* This prevents overflow/wrap-around in round_up_pow2() if sizeofobject + is close to SIZE_MAX. */ + out_of_memory(cinfo, 9); + } + samplesperrow = (JDIMENSION)round_up_pow2(samplesperrow, (2 * ALIGN_SIZE) / + sizeof(JSAMPLE)); + + /* Calculate max # of rows allowed in one allocation chunk */ + ltemp = (MAX_ALLOC_CHUNK-sizeof(large_pool_hdr)) / + ((long) samplesperrow * sizeof(JSAMPLE)); + if (ltemp <= 0) + ERREXIT(cinfo, JERR_WIDTH_OVERFLOW); + if (ltemp < (long) numrows) + rowsperchunk = (JDIMENSION) ltemp; + else + rowsperchunk = numrows; + mem->last_rowsperchunk = rowsperchunk; + + /* Get space for row pointers (small object) */ + result = (JSAMPARRAY) alloc_small(cinfo, pool_id, + (size_t) (numrows * sizeof(JSAMPROW))); + + /* Get the rows themselves (large objects) */ + currow = 0; + while (currow < numrows) { + rowsperchunk = MIN(rowsperchunk, numrows - currow); + workspace = (JSAMPROW) alloc_large(cinfo, pool_id, + (size_t) ((size_t) rowsperchunk * (size_t) samplesperrow + * sizeof(JSAMPLE))); + for (i = rowsperchunk; i > 0; i--) { + result[currow++] = workspace; + workspace += samplesperrow; + } + } + + return result; +} + + +/* + * Creation of 2-D coefficient-block arrays. + * This is essentially the same as the code for sample arrays, above. + */ + +METHODDEF(JBLOCKARRAY) +alloc_barray (j_common_ptr cinfo, int pool_id, + JDIMENSION blocksperrow, JDIMENSION numrows) +/* Allocate a 2-D coefficient-block array */ +{ + my_mem_ptr mem = (my_mem_ptr) cinfo->mem; + JBLOCKARRAY result; + JBLOCKROW workspace; + JDIMENSION rowsperchunk, currow, i; + long ltemp; + + /* Make sure each row is properly aligned */ + if ((sizeof(JBLOCK) % ALIGN_SIZE) != 0) + out_of_memory(cinfo, 6); /* safety check */ + + /* Calculate max # of rows allowed in one allocation chunk */ + ltemp = (MAX_ALLOC_CHUNK-sizeof(large_pool_hdr)) / + ((long) blocksperrow * sizeof(JBLOCK)); + if (ltemp <= 0) + ERREXIT(cinfo, JERR_WIDTH_OVERFLOW); + if (ltemp < (long) numrows) + rowsperchunk = (JDIMENSION) ltemp; + else + rowsperchunk = numrows; + mem->last_rowsperchunk = rowsperchunk; + + /* Get space for row pointers (small object) */ + result = (JBLOCKARRAY) alloc_small(cinfo, pool_id, + (size_t) (numrows * sizeof(JBLOCKROW))); + + /* Get the rows themselves (large objects) */ + currow = 0; + while (currow < numrows) { + rowsperchunk = MIN(rowsperchunk, numrows - currow); + workspace = (JBLOCKROW) alloc_large(cinfo, pool_id, + (size_t) ((size_t) rowsperchunk * (size_t) blocksperrow + * sizeof(JBLOCK))); + for (i = rowsperchunk; i > 0; i--) { + result[currow++] = workspace; + workspace += blocksperrow; + } + } + + return result; +} + + +/* + * About virtual array management: + * + * The above "normal" array routines are only used to allocate strip buffers + * (as wide as the image, but just a few rows high). Full-image-sized buffers + * are handled as "virtual" arrays. The array is still accessed a strip at a + * time, but the memory manager must save the whole array for repeated + * accesses. The intended implementation is that there is a strip buffer in + * memory (as high as is possible given the desired memory limit), plus a + * backing file that holds the rest of the array. + * + * The request_virt_array routines are told the total size of the image and + * the maximum number of rows that will be accessed at once. The in-memory + * buffer must be at least as large as the maxaccess value. + * + * The request routines create control blocks but not the in-memory buffers. + * That is postponed until realize_virt_arrays is called. At that time the + * total amount of space needed is known (approximately, anyway), so free + * memory can be divided up fairly. + * + * The access_virt_array routines are responsible for making a specific strip + * area accessible (after reading or writing the backing file, if necessary). + * Note that the access routines are told whether the caller intends to modify + * the accessed strip; during a read-only pass this saves having to rewrite + * data to disk. The access routines are also responsible for pre-zeroing + * any newly accessed rows, if pre-zeroing was requested. + * + * In current usage, the access requests are usually for nonoverlapping + * strips; that is, successive access start_row numbers differ by exactly + * num_rows = maxaccess. This means we can get good performance with simple + * buffer dump/reload logic, by making the in-memory buffer be a multiple + * of the access height; then there will never be accesses across bufferload + * boundaries. The code will still work with overlapping access requests, + * but it doesn't handle bufferload overlaps very efficiently. + */ + + +METHODDEF(jvirt_sarray_ptr) +request_virt_sarray (j_common_ptr cinfo, int pool_id, boolean pre_zero, + JDIMENSION samplesperrow, JDIMENSION numrows, + JDIMENSION maxaccess) +/* Request a virtual 2-D sample array */ +{ + my_mem_ptr mem = (my_mem_ptr) cinfo->mem; + jvirt_sarray_ptr result; + + /* Only IMAGE-lifetime virtual arrays are currently supported */ + if (pool_id != JPOOL_IMAGE) + ERREXIT1(cinfo, JERR_BAD_POOL_ID, pool_id); /* safety check */ + + /* get control block */ + result = (jvirt_sarray_ptr) alloc_small(cinfo, pool_id, + sizeof(struct jvirt_sarray_control)); + + result->mem_buffer = NULL; /* marks array not yet realized */ + result->rows_in_array = numrows; + result->samplesperrow = samplesperrow; + result->maxaccess = maxaccess; + result->pre_zero = pre_zero; + result->b_s_open = FALSE; /* no associated backing-store object */ + result->next = mem->virt_sarray_list; /* add to list of virtual arrays */ + mem->virt_sarray_list = result; + + return result; +} + + +METHODDEF(jvirt_barray_ptr) +request_virt_barray (j_common_ptr cinfo, int pool_id, boolean pre_zero, + JDIMENSION blocksperrow, JDIMENSION numrows, + JDIMENSION maxaccess) +/* Request a virtual 2-D coefficient-block array */ +{ + my_mem_ptr mem = (my_mem_ptr) cinfo->mem; + jvirt_barray_ptr result; + + /* Only IMAGE-lifetime virtual arrays are currently supported */ + if (pool_id != JPOOL_IMAGE) + ERREXIT1(cinfo, JERR_BAD_POOL_ID, pool_id); /* safety check */ + + /* get control block */ + result = (jvirt_barray_ptr) alloc_small(cinfo, pool_id, + sizeof(struct jvirt_barray_control)); + + result->mem_buffer = NULL; /* marks array not yet realized */ + result->rows_in_array = numrows; + result->blocksperrow = blocksperrow; + result->maxaccess = maxaccess; + result->pre_zero = pre_zero; + result->b_s_open = FALSE; /* no associated backing-store object */ + result->next = mem->virt_barray_list; /* add to list of virtual arrays */ + mem->virt_barray_list = result; + + return result; +} + + +METHODDEF(void) +realize_virt_arrays (j_common_ptr cinfo) +/* Allocate the in-memory buffers for any unrealized virtual arrays */ +{ + my_mem_ptr mem = (my_mem_ptr) cinfo->mem; + size_t space_per_minheight, maximum_space, avail_mem; + size_t minheights, max_minheights; + jvirt_sarray_ptr sptr; + jvirt_barray_ptr bptr; + + /* Compute the minimum space needed (maxaccess rows in each buffer) + * and the maximum space needed (full image height in each buffer). + * These may be of use to the system-dependent jpeg_mem_available routine. + */ + space_per_minheight = 0; + maximum_space = 0; + for (sptr = mem->virt_sarray_list; sptr != NULL; sptr = sptr->next) { + if (sptr->mem_buffer == NULL) { /* if not realized yet */ + size_t new_space = (long) sptr->rows_in_array * + (long) sptr->samplesperrow * sizeof(JSAMPLE); + + space_per_minheight += (long) sptr->maxaccess * + (long) sptr->samplesperrow * sizeof(JSAMPLE); + if (SIZE_MAX - maximum_space < new_space) + out_of_memory(cinfo, 10); + maximum_space += new_space; + } + } + for (bptr = mem->virt_barray_list; bptr != NULL; bptr = bptr->next) { + if (bptr->mem_buffer == NULL) { /* if not realized yet */ + size_t new_space = (long) bptr->rows_in_array * + (long) bptr->blocksperrow * sizeof(JBLOCK); + + space_per_minheight += (long) bptr->maxaccess * + (long) bptr->blocksperrow * sizeof(JBLOCK); + if (SIZE_MAX - maximum_space < new_space) + out_of_memory(cinfo, 11); + maximum_space += new_space; + } + } + + if (space_per_minheight <= 0) + return; /* no unrealized arrays, no work */ + + /* Determine amount of memory to actually use; this is system-dependent. */ + avail_mem = jpeg_mem_available(cinfo, space_per_minheight, maximum_space, + mem->total_space_allocated); + + /* If the maximum space needed is available, make all the buffers full + * height; otherwise parcel it out with the same number of minheights + * in each buffer. + */ + if (avail_mem >= maximum_space) + max_minheights = 1000000000L; + else { + max_minheights = avail_mem / space_per_minheight; + /* If there doesn't seem to be enough space, try to get the minimum + * anyway. This allows a "stub" implementation of jpeg_mem_available(). + */ + if (max_minheights <= 0) + max_minheights = 1; + } + + /* Allocate the in-memory buffers and initialize backing store as needed. */ + + for (sptr = mem->virt_sarray_list; sptr != NULL; sptr = sptr->next) { + if (sptr->mem_buffer == NULL) { /* if not realized yet */ + minheights = ((long) sptr->rows_in_array - 1L) / sptr->maxaccess + 1L; + if (minheights <= max_minheights) { + /* This buffer fits in memory */ + sptr->rows_in_mem = sptr->rows_in_array; + } else { + /* It doesn't fit in memory, create backing store. */ + sptr->rows_in_mem = (JDIMENSION) (max_minheights * sptr->maxaccess); + jpeg_open_backing_store(cinfo, & sptr->b_s_info, + (long) sptr->rows_in_array * + (long) sptr->samplesperrow * + (long) sizeof(JSAMPLE)); + sptr->b_s_open = TRUE; + } + sptr->mem_buffer = alloc_sarray(cinfo, JPOOL_IMAGE, + sptr->samplesperrow, sptr->rows_in_mem); + sptr->rowsperchunk = mem->last_rowsperchunk; + sptr->cur_start_row = 0; + sptr->first_undef_row = 0; + sptr->dirty = FALSE; + } + } + + for (bptr = mem->virt_barray_list; bptr != NULL; bptr = bptr->next) { + if (bptr->mem_buffer == NULL) { /* if not realized yet */ + minheights = ((long) bptr->rows_in_array - 1L) / bptr->maxaccess + 1L; + if (minheights <= max_minheights) { + /* This buffer fits in memory */ + bptr->rows_in_mem = bptr->rows_in_array; + } else { + /* It doesn't fit in memory, create backing store. */ + bptr->rows_in_mem = (JDIMENSION) (max_minheights * bptr->maxaccess); + jpeg_open_backing_store(cinfo, & bptr->b_s_info, + (long) bptr->rows_in_array * + (long) bptr->blocksperrow * + (long) sizeof(JBLOCK)); + bptr->b_s_open = TRUE; + } + bptr->mem_buffer = alloc_barray(cinfo, JPOOL_IMAGE, + bptr->blocksperrow, bptr->rows_in_mem); + bptr->rowsperchunk = mem->last_rowsperchunk; + bptr->cur_start_row = 0; + bptr->first_undef_row = 0; + bptr->dirty = FALSE; + } + } +} + + +LOCAL(void) +do_sarray_io (j_common_ptr cinfo, jvirt_sarray_ptr ptr, boolean writing) +/* Do backing store read or write of a virtual sample array */ +{ + long bytesperrow, file_offset, byte_count, rows, thisrow, i; + + bytesperrow = (long) ptr->samplesperrow * sizeof(JSAMPLE); + file_offset = ptr->cur_start_row * bytesperrow; + /* Loop to read or write each allocation chunk in mem_buffer */ + for (i = 0; i < (long) ptr->rows_in_mem; i += ptr->rowsperchunk) { + /* One chunk, but check for short chunk at end of buffer */ + rows = MIN((long) ptr->rowsperchunk, (long) ptr->rows_in_mem - i); + /* Transfer no more than is currently defined */ + thisrow = (long) ptr->cur_start_row + i; + rows = MIN(rows, (long) ptr->first_undef_row - thisrow); + /* Transfer no more than fits in file */ + rows = MIN(rows, (long) ptr->rows_in_array - thisrow); + if (rows <= 0) /* this chunk might be past end of file! */ + break; + byte_count = rows * bytesperrow; + if (writing) + (*ptr->b_s_info.write_backing_store) (cinfo, & ptr->b_s_info, + (void *) ptr->mem_buffer[i], + file_offset, byte_count); + else + (*ptr->b_s_info.read_backing_store) (cinfo, & ptr->b_s_info, + (void *) ptr->mem_buffer[i], + file_offset, byte_count); + file_offset += byte_count; + } +} + + +LOCAL(void) +do_barray_io (j_common_ptr cinfo, jvirt_barray_ptr ptr, boolean writing) +/* Do backing store read or write of a virtual coefficient-block array */ +{ + long bytesperrow, file_offset, byte_count, rows, thisrow, i; + + bytesperrow = (long) ptr->blocksperrow * sizeof(JBLOCK); + file_offset = ptr->cur_start_row * bytesperrow; + /* Loop to read or write each allocation chunk in mem_buffer */ + for (i = 0; i < (long) ptr->rows_in_mem; i += ptr->rowsperchunk) { + /* One chunk, but check for short chunk at end of buffer */ + rows = MIN((long) ptr->rowsperchunk, (long) ptr->rows_in_mem - i); + /* Transfer no more than is currently defined */ + thisrow = (long) ptr->cur_start_row + i; + rows = MIN(rows, (long) ptr->first_undef_row - thisrow); + /* Transfer no more than fits in file */ + rows = MIN(rows, (long) ptr->rows_in_array - thisrow); + if (rows <= 0) /* this chunk might be past end of file! */ + break; + byte_count = rows * bytesperrow; + if (writing) + (*ptr->b_s_info.write_backing_store) (cinfo, & ptr->b_s_info, + (void *) ptr->mem_buffer[i], + file_offset, byte_count); + else + (*ptr->b_s_info.read_backing_store) (cinfo, & ptr->b_s_info, + (void *) ptr->mem_buffer[i], + file_offset, byte_count); + file_offset += byte_count; + } +} + + +METHODDEF(JSAMPARRAY) +access_virt_sarray (j_common_ptr cinfo, jvirt_sarray_ptr ptr, + JDIMENSION start_row, JDIMENSION num_rows, + boolean writable) +/* Access the part of a virtual sample array starting at start_row */ +/* and extending for num_rows rows. writable is true if */ +/* caller intends to modify the accessed area. */ +{ + JDIMENSION end_row = start_row + num_rows; + JDIMENSION undef_row; + + /* debugging check */ + if (end_row > ptr->rows_in_array || num_rows > ptr->maxaccess || + ptr->mem_buffer == NULL) + ERREXIT(cinfo, JERR_BAD_VIRTUAL_ACCESS); + + /* Make the desired part of the virtual array accessible */ + if (start_row < ptr->cur_start_row || + end_row > ptr->cur_start_row+ptr->rows_in_mem) { + if (! ptr->b_s_open) + ERREXIT(cinfo, JERR_VIRTUAL_BUG); + /* Flush old buffer contents if necessary */ + if (ptr->dirty) { + do_sarray_io(cinfo, ptr, TRUE); + ptr->dirty = FALSE; + } + /* Decide what part of virtual array to access. + * Algorithm: if target address > current window, assume forward scan, + * load starting at target address. If target address < current window, + * assume backward scan, load so that target area is top of window. + * Note that when switching from forward write to forward read, will have + * start_row = 0, so the limiting case applies and we load from 0 anyway. + */ + if (start_row > ptr->cur_start_row) { + ptr->cur_start_row = start_row; + } else { + /* use long arithmetic here to avoid overflow & unsigned problems */ + long ltemp; + + ltemp = (long) end_row - (long) ptr->rows_in_mem; + if (ltemp < 0) + ltemp = 0; /* don't fall off front end of file */ + ptr->cur_start_row = (JDIMENSION) ltemp; + } + /* Read in the selected part of the array. + * During the initial write pass, we will do no actual read + * because the selected part is all undefined. + */ + do_sarray_io(cinfo, ptr, FALSE); + } + /* Ensure the accessed part of the array is defined; prezero if needed. + * To improve locality of access, we only prezero the part of the array + * that the caller is about to access, not the entire in-memory array. + */ + if (ptr->first_undef_row < end_row) { + if (ptr->first_undef_row < start_row) { + if (writable) /* writer skipped over a section of array */ + ERREXIT(cinfo, JERR_BAD_VIRTUAL_ACCESS); + undef_row = start_row; /* but reader is allowed to read ahead */ + } else { + undef_row = ptr->first_undef_row; + } + if (writable) + ptr->first_undef_row = end_row; + if (ptr->pre_zero) { + size_t bytesperrow = (size_t) ptr->samplesperrow * sizeof(JSAMPLE); + undef_row -= ptr->cur_start_row; /* make indexes relative to buffer */ + end_row -= ptr->cur_start_row; + while (undef_row < end_row) { + jzero_far((void *) ptr->mem_buffer[undef_row], bytesperrow); + undef_row++; + } + } else { + if (! writable) /* reader looking at undefined data */ + ERREXIT(cinfo, JERR_BAD_VIRTUAL_ACCESS); + } + } + /* Flag the buffer dirty if caller will write in it */ + if (writable) + ptr->dirty = TRUE; + /* Return address of proper part of the buffer */ + return ptr->mem_buffer + (start_row - ptr->cur_start_row); +} + + +METHODDEF(JBLOCKARRAY) +access_virt_barray (j_common_ptr cinfo, jvirt_barray_ptr ptr, + JDIMENSION start_row, JDIMENSION num_rows, + boolean writable) +/* Access the part of a virtual block array starting at start_row */ +/* and extending for num_rows rows. writable is true if */ +/* caller intends to modify the accessed area. */ +{ + JDIMENSION end_row = start_row + num_rows; + JDIMENSION undef_row; + + /* debugging check */ + if (end_row > ptr->rows_in_array || num_rows > ptr->maxaccess || + ptr->mem_buffer == NULL) + ERREXIT(cinfo, JERR_BAD_VIRTUAL_ACCESS); + + /* Make the desired part of the virtual array accessible */ + if (start_row < ptr->cur_start_row || + end_row > ptr->cur_start_row+ptr->rows_in_mem) { + if (! ptr->b_s_open) + ERREXIT(cinfo, JERR_VIRTUAL_BUG); + /* Flush old buffer contents if necessary */ + if (ptr->dirty) { + do_barray_io(cinfo, ptr, TRUE); + ptr->dirty = FALSE; + } + /* Decide what part of virtual array to access. + * Algorithm: if target address > current window, assume forward scan, + * load starting at target address. If target address < current window, + * assume backward scan, load so that target area is top of window. + * Note that when switching from forward write to forward read, will have + * start_row = 0, so the limiting case applies and we load from 0 anyway. + */ + if (start_row > ptr->cur_start_row) { + ptr->cur_start_row = start_row; + } else { + /* use long arithmetic here to avoid overflow & unsigned problems */ + long ltemp; + + ltemp = (long) end_row - (long) ptr->rows_in_mem; + if (ltemp < 0) + ltemp = 0; /* don't fall off front end of file */ + ptr->cur_start_row = (JDIMENSION) ltemp; + } + /* Read in the selected part of the array. + * During the initial write pass, we will do no actual read + * because the selected part is all undefined. + */ + do_barray_io(cinfo, ptr, FALSE); + } + /* Ensure the accessed part of the array is defined; prezero if needed. + * To improve locality of access, we only prezero the part of the array + * that the caller is about to access, not the entire in-memory array. + */ + if (ptr->first_undef_row < end_row) { + if (ptr->first_undef_row < start_row) { + if (writable) /* writer skipped over a section of array */ + ERREXIT(cinfo, JERR_BAD_VIRTUAL_ACCESS); + undef_row = start_row; /* but reader is allowed to read ahead */ + } else { + undef_row = ptr->first_undef_row; + } + if (writable) + ptr->first_undef_row = end_row; + if (ptr->pre_zero) { + size_t bytesperrow = (size_t) ptr->blocksperrow * sizeof(JBLOCK); + undef_row -= ptr->cur_start_row; /* make indexes relative to buffer */ + end_row -= ptr->cur_start_row; + while (undef_row < end_row) { + jzero_far((void *) ptr->mem_buffer[undef_row], bytesperrow); + undef_row++; + } + } else { + if (! writable) /* reader looking at undefined data */ + ERREXIT(cinfo, JERR_BAD_VIRTUAL_ACCESS); + } + } + /* Flag the buffer dirty if caller will write in it */ + if (writable) + ptr->dirty = TRUE; + /* Return address of proper part of the buffer */ + return ptr->mem_buffer + (start_row - ptr->cur_start_row); +} + + +/* + * Release all objects belonging to a specified pool. + */ + +METHODDEF(void) +free_pool (j_common_ptr cinfo, int pool_id) +{ + my_mem_ptr mem = (my_mem_ptr) cinfo->mem; + small_pool_ptr shdr_ptr; + large_pool_ptr lhdr_ptr; + size_t space_freed; + + if (pool_id < 0 || pool_id >= JPOOL_NUMPOOLS) + ERREXIT1(cinfo, JERR_BAD_POOL_ID, pool_id); /* safety check */ + +#ifdef MEM_STATS + if (cinfo->err->trace_level > 1) + print_mem_stats(cinfo, pool_id); /* print pool's memory usage statistics */ +#endif + + /* If freeing IMAGE pool, close any virtual arrays first */ + if (pool_id == JPOOL_IMAGE) { + jvirt_sarray_ptr sptr; + jvirt_barray_ptr bptr; + + for (sptr = mem->virt_sarray_list; sptr != NULL; sptr = sptr->next) { + if (sptr->b_s_open) { /* there may be no backing store */ + sptr->b_s_open = FALSE; /* prevent recursive close if error */ + (*sptr->b_s_info.close_backing_store) (cinfo, & sptr->b_s_info); + } + } + mem->virt_sarray_list = NULL; + for (bptr = mem->virt_barray_list; bptr != NULL; bptr = bptr->next) { + if (bptr->b_s_open) { /* there may be no backing store */ + bptr->b_s_open = FALSE; /* prevent recursive close if error */ + (*bptr->b_s_info.close_backing_store) (cinfo, & bptr->b_s_info); + } + } + mem->virt_barray_list = NULL; + } + + /* Release large objects */ + lhdr_ptr = mem->large_list[pool_id]; + mem->large_list[pool_id] = NULL; + + while (lhdr_ptr != NULL) { + large_pool_ptr next_lhdr_ptr = lhdr_ptr->next; + space_freed = lhdr_ptr->bytes_used + + lhdr_ptr->bytes_left + + sizeof(large_pool_hdr); + jpeg_free_large(cinfo, (void *) lhdr_ptr, space_freed); + mem->total_space_allocated -= space_freed; + lhdr_ptr = next_lhdr_ptr; + } + + /* Release small objects */ + shdr_ptr = mem->small_list[pool_id]; + mem->small_list[pool_id] = NULL; + + while (shdr_ptr != NULL) { + small_pool_ptr next_shdr_ptr = shdr_ptr->next; + space_freed = shdr_ptr->bytes_used + + shdr_ptr->bytes_left + + sizeof(small_pool_hdr); + jpeg_free_small(cinfo, (void *) shdr_ptr, space_freed); + mem->total_space_allocated -= space_freed; + shdr_ptr = next_shdr_ptr; + } +} + + +/* + * Close up shop entirely. + * Note that this cannot be called unless cinfo->mem is non-NULL. + */ + +METHODDEF(void) +self_destruct (j_common_ptr cinfo) +{ + int pool; + + /* Close all backing store, release all memory. + * Releasing pools in reverse order might help avoid fragmentation + * with some (brain-damaged) malloc libraries. + */ + for (pool = JPOOL_NUMPOOLS-1; pool >= JPOOL_PERMANENT; pool--) { + free_pool(cinfo, pool); + } + + /* Release the memory manager control block too. */ + jpeg_free_small(cinfo, (void *) cinfo->mem, sizeof(my_memory_mgr)); + cinfo->mem = NULL; /* ensures I will be called only once */ + + jpeg_mem_term(cinfo); /* system-dependent cleanup */ +} + + +/* + * Memory manager initialization. + * When this is called, only the error manager pointer is valid in cinfo! + */ + +GLOBAL(void) +jinit_memory_mgr (j_common_ptr cinfo) +{ + my_mem_ptr mem; + long max_to_use; + int pool; + size_t test_mac; + + cinfo->mem = NULL; /* for safety if init fails */ + + /* Check for configuration errors. + * sizeof(ALIGN_TYPE) should be a power of 2; otherwise, it probably + * doesn't reflect any real hardware alignment requirement. + * The test is a little tricky: for X>0, X and X-1 have no one-bits + * in common if and only if X is a power of 2, ie has only one one-bit. + * Some compilers may give an "unreachable code" warning here; ignore it. + */ + if ((ALIGN_SIZE & (ALIGN_SIZE-1)) != 0) + ERREXIT(cinfo, JERR_BAD_ALIGN_TYPE); + /* MAX_ALLOC_CHUNK must be representable as type size_t, and must be + * a multiple of ALIGN_SIZE. + * Again, an "unreachable code" warning may be ignored here. + * But a "constant too large" warning means you need to fix MAX_ALLOC_CHUNK. + */ + test_mac = (size_t) MAX_ALLOC_CHUNK; + if ((long) test_mac != MAX_ALLOC_CHUNK || + (MAX_ALLOC_CHUNK % ALIGN_SIZE) != 0) + ERREXIT(cinfo, JERR_BAD_ALLOC_CHUNK); + + max_to_use = jpeg_mem_init(cinfo); /* system-dependent initialization */ + + /* Attempt to allocate memory manager's control block */ + mem = (my_mem_ptr) jpeg_get_small(cinfo, sizeof(my_memory_mgr)); + + if (mem == NULL) { + jpeg_mem_term(cinfo); /* system-dependent cleanup */ + ERREXIT1(cinfo, JERR_OUT_OF_MEMORY, 0); + } + + /* OK, fill in the method pointers */ + mem->pub.alloc_small = alloc_small; + mem->pub.alloc_large = alloc_large; + mem->pub.alloc_sarray = alloc_sarray; + mem->pub.alloc_barray = alloc_barray; + mem->pub.request_virt_sarray = request_virt_sarray; + mem->pub.request_virt_barray = request_virt_barray; + mem->pub.realize_virt_arrays = realize_virt_arrays; + mem->pub.access_virt_sarray = access_virt_sarray; + mem->pub.access_virt_barray = access_virt_barray; + mem->pub.free_pool = free_pool; + mem->pub.self_destruct = self_destruct; + + /* Make MAX_ALLOC_CHUNK accessible to other modules */ + mem->pub.max_alloc_chunk = MAX_ALLOC_CHUNK; + + /* Initialize working state */ + mem->pub.max_memory_to_use = max_to_use; + + for (pool = JPOOL_NUMPOOLS-1; pool >= JPOOL_PERMANENT; pool--) { + mem->small_list[pool] = NULL; + mem->large_list[pool] = NULL; + } + mem->virt_sarray_list = NULL; + mem->virt_barray_list = NULL; + + mem->total_space_allocated = sizeof(my_memory_mgr); + + /* Declare ourselves open for business */ + cinfo->mem = & mem->pub; + + /* Check for an environment variable JPEGMEM; if found, override the + * default max_memory setting from jpeg_mem_init. Note that the + * surrounding application may again override this value. + * If your system doesn't support getenv(), define NO_GETENV to disable + * this feature. + */ +#ifndef NO_GETENV + { char *memenv; + + if ((memenv = getenv("JPEGMEM")) != NULL) { + char ch = 'x'; + + if (sscanf(memenv, "%ld%c", &max_to_use, &ch) > 0) { + if (ch == 'm' || ch == 'M') + max_to_use *= 1000L; + mem->pub.max_memory_to_use = max_to_use * 1000L; + } + } + } +#endif + +} diff --git a/media/libjpeg/jmemnobs.c b/media/libjpeg/jmemnobs.c new file mode 100644 index 000000000..5797198de --- /dev/null +++ b/media/libjpeg/jmemnobs.c @@ -0,0 +1,109 @@ +/* + * jmemnobs.c + * + * This file was part of the Independent JPEG Group's software: + * Copyright (C) 1992-1996, Thomas G. Lane. + * It was modified by The libjpeg-turbo Project to include only code and + * information relevant to libjpeg-turbo. + * For conditions of distribution and use, see the accompanying README.ijg + * file. + * + * This file provides a really simple implementation of the system- + * dependent portion of the JPEG memory manager. This implementation + * assumes that no backing-store files are needed: all required space + * can be obtained from malloc(). + * This is very portable in the sense that it'll compile on almost anything, + * but you'd better have lots of main memory (or virtual memory) if you want + * to process big images. + * Note that the max_memory_to_use option is ignored by this implementation. + */ + +#define JPEG_INTERNALS +#include "jinclude.h" +#include "jpeglib.h" +#include "jmemsys.h" /* import the system-dependent declarations */ + +#ifndef HAVE_STDLIB_H /* <stdlib.h> should declare malloc(),free() */ +extern void *malloc (size_t size); +extern void free (void *ptr); +#endif + + +/* + * Memory allocation and freeing are controlled by the regular library + * routines malloc() and free(). + */ + +GLOBAL(void *) +jpeg_get_small (j_common_ptr cinfo, size_t sizeofobject) +{ + return (void *) malloc(sizeofobject); +} + +GLOBAL(void) +jpeg_free_small (j_common_ptr cinfo, void *object, size_t sizeofobject) +{ + free(object); +} + + +/* + * "Large" objects are treated the same as "small" ones. + */ + +GLOBAL(void *) +jpeg_get_large (j_common_ptr cinfo, size_t sizeofobject) +{ + return (void *) malloc(sizeofobject); +} + +GLOBAL(void) +jpeg_free_large (j_common_ptr cinfo, void *object, size_t sizeofobject) +{ + free(object); +} + + +/* + * This routine computes the total memory space available for allocation. + * Here we always say, "we got all you want bud!" + */ + +GLOBAL(size_t) +jpeg_mem_available (j_common_ptr cinfo, size_t min_bytes_needed, + size_t max_bytes_needed, size_t already_allocated) +{ + return max_bytes_needed; +} + + +/* + * Backing store (temporary file) management. + * Since jpeg_mem_available always promised the moon, + * this should never be called and we can just error out. + */ + +GLOBAL(void) +jpeg_open_backing_store (j_common_ptr cinfo, backing_store_ptr info, + long total_bytes_needed) +{ + ERREXIT(cinfo, JERR_NO_BACKING_STORE); +} + + +/* + * These routines take care of any system-dependent initialization and + * cleanup required. Here, there isn't any. + */ + +GLOBAL(long) +jpeg_mem_init (j_common_ptr cinfo) +{ + return 0; /* just set max_memory_to_use to 0 */ +} + +GLOBAL(void) +jpeg_mem_term (j_common_ptr cinfo) +{ + /* no work */ +} diff --git a/media/libjpeg/jmemsys.h b/media/libjpeg/jmemsys.h new file mode 100644 index 000000000..f7dfe87a8 --- /dev/null +++ b/media/libjpeg/jmemsys.h @@ -0,0 +1,178 @@ +/* + * jmemsys.h + * + * This file was part of the Independent JPEG Group's software: + * Copyright (C) 1992-1997, Thomas G. Lane. + * It was modified by The libjpeg-turbo Project to include only code and + * information relevant to libjpeg-turbo. + * For conditions of distribution and use, see the accompanying README.ijg + * file. + * + * This include file defines the interface between the system-independent + * and system-dependent portions of the JPEG memory manager. No other + * modules need include it. (The system-independent portion is jmemmgr.c; + * there are several different versions of the system-dependent portion.) + * + * This file works as-is for the system-dependent memory managers supplied + * in the IJG distribution. You may need to modify it if you write a + * custom memory manager. If system-dependent changes are needed in + * this file, the best method is to #ifdef them based on a configuration + * symbol supplied in jconfig.h. + */ + + +/* + * These two functions are used to allocate and release small chunks of + * memory. (Typically the total amount requested through jpeg_get_small is + * no more than 20K or so; this will be requested in chunks of a few K each.) + * Behavior should be the same as for the standard library functions malloc + * and free; in particular, jpeg_get_small must return NULL on failure. + * On most systems, these ARE malloc and free. jpeg_free_small is passed the + * size of the object being freed, just in case it's needed. + */ + +EXTERN(void *) jpeg_get_small (j_common_ptr cinfo, size_t sizeofobject); +EXTERN(void) jpeg_free_small (j_common_ptr cinfo, void *object, + size_t sizeofobject); + +/* + * These two functions are used to allocate and release large chunks of + * memory (up to the total free space designated by jpeg_mem_available). + * These are identical to the jpeg_get/free_small routines; but we keep them + * separate anyway, in case a different allocation strategy is desirable for + * large chunks. + */ + +EXTERN(void *) jpeg_get_large (j_common_ptr cinfo, size_t sizeofobject); +EXTERN(void) jpeg_free_large (j_common_ptr cinfo, void *object, + size_t sizeofobject); + +/* + * The macro MAX_ALLOC_CHUNK designates the maximum number of bytes that may + * be requested in a single call to jpeg_get_large (and jpeg_get_small for that + * matter, but that case should never come into play). This macro was needed + * to model the 64Kb-segment-size limit of far addressing on 80x86 machines. + * On machines with flat address spaces, any large constant may be used. + * + * NB: jmemmgr.c expects that MAX_ALLOC_CHUNK will be representable as type + * size_t and will be a multiple of sizeof(align_type). + */ + +#ifndef MAX_ALLOC_CHUNK /* may be overridden in jconfig.h */ +#define MAX_ALLOC_CHUNK 1000000000L +#endif + +/* + * This routine computes the total space still available for allocation by + * jpeg_get_large. If more space than this is needed, backing store will be + * used. NOTE: any memory already allocated must not be counted. + * + * There is a minimum space requirement, corresponding to the minimum + * feasible buffer sizes; jmemmgr.c will request that much space even if + * jpeg_mem_available returns zero. The maximum space needed, enough to hold + * all working storage in memory, is also passed in case it is useful. + * Finally, the total space already allocated is passed. If no better + * method is available, cinfo->mem->max_memory_to_use - already_allocated + * is often a suitable calculation. + * + * It is OK for jpeg_mem_available to underestimate the space available + * (that'll just lead to more backing-store access than is really necessary). + * However, an overestimate will lead to failure. Hence it's wise to subtract + * a slop factor from the true available space. 5% should be enough. + * + * On machines with lots of virtual memory, any large constant may be returned. + * Conversely, zero may be returned to always use the minimum amount of memory. + */ + +EXTERN(size_t) jpeg_mem_available (j_common_ptr cinfo, size_t min_bytes_needed, + size_t max_bytes_needed, + size_t already_allocated); + + +/* + * This structure holds whatever state is needed to access a single + * backing-store object. The read/write/close method pointers are called + * by jmemmgr.c to manipulate the backing-store object; all other fields + * are private to the system-dependent backing store routines. + */ + +#define TEMP_NAME_LENGTH 64 /* max length of a temporary file's name */ + + +#ifdef USE_MSDOS_MEMMGR /* DOS-specific junk */ + +typedef unsigned short XMSH; /* type of extended-memory handles */ +typedef unsigned short EMSH; /* type of expanded-memory handles */ + +typedef union { + short file_handle; /* DOS file handle if it's a temp file */ + XMSH xms_handle; /* handle if it's a chunk of XMS */ + EMSH ems_handle; /* handle if it's a chunk of EMS */ +} handle_union; + +#endif /* USE_MSDOS_MEMMGR */ + +#ifdef USE_MAC_MEMMGR /* Mac-specific junk */ +#include <Files.h> +#endif /* USE_MAC_MEMMGR */ + + +typedef struct backing_store_struct *backing_store_ptr; + +typedef struct backing_store_struct { + /* Methods for reading/writing/closing this backing-store object */ + void (*read_backing_store) (j_common_ptr cinfo, backing_store_ptr info, + void *buffer_address, long file_offset, + long byte_count); + void (*write_backing_store) (j_common_ptr cinfo, backing_store_ptr info, + void *buffer_address, long file_offset, + long byte_count); + void (*close_backing_store) (j_common_ptr cinfo, backing_store_ptr info); + + /* Private fields for system-dependent backing-store management */ +#ifdef USE_MSDOS_MEMMGR + /* For the MS-DOS manager (jmemdos.c), we need: */ + handle_union handle; /* reference to backing-store storage object */ + char temp_name[TEMP_NAME_LENGTH]; /* name if it's a file */ +#else +#ifdef USE_MAC_MEMMGR + /* For the Mac manager (jmemmac.c), we need: */ + short temp_file; /* file reference number to temp file */ + FSSpec tempSpec; /* the FSSpec for the temp file */ + char temp_name[TEMP_NAME_LENGTH]; /* name if it's a file */ +#else + /* For a typical implementation with temp files, we need: */ + FILE *temp_file; /* stdio reference to temp file */ + char temp_name[TEMP_NAME_LENGTH]; /* name of temp file */ +#endif +#endif +} backing_store_info; + + +/* + * Initial opening of a backing-store object. This must fill in the + * read/write/close pointers in the object. The read/write routines + * may take an error exit if the specified maximum file size is exceeded. + * (If jpeg_mem_available always returns a large value, this routine can + * just take an error exit.) + */ + +EXTERN(void) jpeg_open_backing_store (j_common_ptr cinfo, + backing_store_ptr info, + long total_bytes_needed); + + +/* + * These routines take care of any system-dependent initialization and + * cleanup required. jpeg_mem_init will be called before anything is + * allocated (and, therefore, nothing in cinfo is of use except the error + * manager pointer). It should return a suitable default value for + * max_memory_to_use; this may subsequently be overridden by the surrounding + * application. (Note that max_memory_to_use is only important if + * jpeg_mem_available chooses to consult it ... no one else will.) + * jpeg_mem_term may assume that all requested memory has been freed and that + * all opened backing-store objects have been closed. + */ + +EXTERN(long) jpeg_mem_init (j_common_ptr cinfo); +EXTERN(void) jpeg_mem_term (j_common_ptr cinfo); diff --git a/media/libjpeg/jmorecfg.h b/media/libjpeg/jmorecfg.h new file mode 100644 index 000000000..d73b1090d --- /dev/null +++ b/media/libjpeg/jmorecfg.h @@ -0,0 +1,400 @@ +/* + * jmorecfg.h + * + * This file was part of the Independent JPEG Group's software: + * Copyright (C) 1991-1997, Thomas G. Lane. + * Modified 1997-2009 by Guido Vollbeding. + * libjpeg-turbo Modifications: + * Copyright (C) 2009, 2011, 2014-2015, D. R. Commander. + * For conditions of distribution and use, see the accompanying README.ijg + * file. + * + * This file contains additional configuration options that customize the + * JPEG software for special applications or support machine-dependent + * optimizations. Most users will not need to touch this file. + */ + +#include <stdint.h> + +/* + * Maximum number of components (color channels) allowed in JPEG image. + * To meet the letter of the JPEG spec, set this to 255. However, darn + * few applications need more than 4 channels (maybe 5 for CMYK + alpha + * mask). We recommend 10 as a reasonable compromise; use 4 if you are + * really short on memory. (Each allowed component costs a hundred or so + * bytes of storage, whether actually used in an image or not.) + */ + +#define MAX_COMPONENTS 10 /* maximum number of image components */ + + +/* + * Basic data types. + * You may need to change these if you have a machine with unusual data + * type sizes; for example, "char" not 8 bits, "short" not 16 bits, + * or "long" not 32 bits. We don't care whether "int" is 16 or 32 bits, + * but it had better be at least 16. + */ + +/* Representation of a single sample (pixel element value). + * We frequently allocate large arrays of these, so it's important to keep + * them small. But if you have memory to burn and access to char or short + * arrays is very slow on your hardware, you might want to change these. + */ + +#if BITS_IN_JSAMPLE == 8 +/* JSAMPLE should be the smallest type that will hold the values 0..255. + * You can use a signed char by having GETJSAMPLE mask it with 0xFF. + */ + +#ifdef HAVE_UNSIGNED_CHAR + +typedef unsigned char JSAMPLE; +#define GETJSAMPLE(value) ((int) (value)) + +#else /* not HAVE_UNSIGNED_CHAR */ + +typedef char JSAMPLE; +#ifdef __CHAR_UNSIGNED__ +#define GETJSAMPLE(value) ((int) (value)) +#else +#define GETJSAMPLE(value) ((int) (value) & 0xFF) +#endif /* __CHAR_UNSIGNED__ */ + +#endif /* HAVE_UNSIGNED_CHAR */ + +#define MAXJSAMPLE 255 +#define CENTERJSAMPLE 128 + +#endif /* BITS_IN_JSAMPLE == 8 */ + + +#if BITS_IN_JSAMPLE == 12 +/* JSAMPLE should be the smallest type that will hold the values 0..4095. + * On nearly all machines "short" will do nicely. + */ + +typedef short JSAMPLE; +#define GETJSAMPLE(value) ((int) (value)) + +#define MAXJSAMPLE 4095 +#define CENTERJSAMPLE 2048 + +#endif /* BITS_IN_JSAMPLE == 12 */ + + +/* Representation of a DCT frequency coefficient. + * This should be a signed value of at least 16 bits; "short" is usually OK. + * Again, we allocate large arrays of these, but you can change to int + * if you have memory to burn and "short" is really slow. + */ + +typedef short JCOEF; + + +/* Compressed datastreams are represented as arrays of JOCTET. + * These must be EXACTLY 8 bits wide, at least once they are written to + * external storage. Note that when using the stdio data source/destination + * managers, this is also the data type passed to fread/fwrite. + */ + +#ifdef HAVE_UNSIGNED_CHAR + +typedef unsigned char JOCTET; +#define GETJOCTET(value) (value) + +#else /* not HAVE_UNSIGNED_CHAR */ + +typedef char JOCTET; +#ifdef __CHAR_UNSIGNED__ +#define GETJOCTET(value) (value) +#else +#define GETJOCTET(value) ((value) & 0xFF) +#endif /* __CHAR_UNSIGNED__ */ + +#endif /* HAVE_UNSIGNED_CHAR */ + + +/* These typedefs are used for various table entries and so forth. + * They must be at least as wide as specified; but making them too big + * won't cost a huge amount of memory, so we don't provide special + * extraction code like we did for JSAMPLE. (In other words, these + * typedefs live at a different point on the speed/space tradeoff curve.) + */ + +/* UINT8 must hold at least the values 0..255. */ + +typedef uint8_t UINT8; + +/* UINT16 must hold at least the values 0..65535. */ + +typedef uint16_t UINT16; + +/* INT16 must hold at least the values -32768..32767. */ + +typedef int16_t INT16; + +/* INT32 must hold at least signed 32-bit values. + * + * NOTE: The INT32 typedef dates back to libjpeg v5 (1994.) Integers were + * sometimes 16-bit back then (MS-DOS), which is why INT32 is typedef'd to + * long. It also wasn't common (or at least as common) in 1994 for INT32 to be + * defined by platform headers. Since then, however, INT32 is defined in + * several other common places: + * + * Xmd.h (X11 header) typedefs INT32 to int on 64-bit platforms and long on + * 32-bit platforms (i.e always a 32-bit signed type.) + * + * basetsd.h (Win32 header) typedefs INT32 to int (always a 32-bit signed type + * on modern platforms.) + * + * qglobal.h (Qt header) typedefs INT32 to int (always a 32-bit signed type on + * modern platforms.) + * + * This is a recipe for conflict, since "long" and "int" aren't always + * compatible types. Since the definition of INT32 has technically been part + * of the libjpeg API for more than 20 years, we can't remove it, but we do not + * use it internally any longer. We instead define a separate type (JLONG) + * for internal use, which ensures that internal behavior will always be the + * same regardless of any external headers that may be included. + */ + +typedef int32_t INT32; + +/* Datatype used for image dimensions. The JPEG standard only supports + * images up to 64K*64K due to 16-bit fields in SOF markers. Therefore + * "unsigned int" is sufficient on all machines. However, if you need to + * handle larger images and you don't mind deviating from the spec, you + * can change this datatype. (Note that changing this datatype will + * potentially require modifying the SIMD code. The x86-64 SIMD extensions, + * in particular, assume a 32-bit JDIMENSION.) + */ + +typedef unsigned int JDIMENSION; + +#define JPEG_MAX_DIMENSION 65500L /* a tad under 64K to prevent overflows */ + + +/* These macros are used in all function definitions and extern declarations. + * You could modify them if you need to change function linkage conventions; + * in particular, you'll need to do that to make the library a Windows DLL. + * Another application is to make all functions global for use with debuggers + * or code profilers that require it. + */ + +/* a function called through method pointers: */ +#define METHODDEF(type) static type +/* a function used only in its module: */ +#define LOCAL(type) static type +/* a function referenced thru EXTERNs: */ +#define GLOBAL(type) type +/* a reference to a GLOBAL function: */ +#define EXTERN(type) extern type + + +/* Originally, this macro was used as a way of defining function prototypes + * for both modern compilers as well as older compilers that did not support + * prototype parameters. libjpeg-turbo has never supported these older, + * non-ANSI compilers, but the macro is still included because there is some + * software out there that uses it. + */ + +#define JMETHOD(type,methodname,arglist) type (*methodname) arglist + + +/* libjpeg-turbo no longer supports platforms that have far symbols (MS-DOS), + * but again, some software relies on this macro. + */ + +#undef FAR +#define FAR + + +/* + * On a few systems, type boolean and/or its values FALSE, TRUE may appear + * in standard header files. Or you may have conflicts with application- + * specific header files that you want to include together with these files. + * Defining HAVE_BOOLEAN before including jpeglib.h should make it work. + */ + +#ifndef HAVE_BOOLEAN +typedef int boolean; +#endif +#ifndef FALSE /* in case these macros already exist */ +#define FALSE 0 /* values of boolean */ +#endif +#ifndef TRUE +#define TRUE 1 +#endif + + +/* + * The remaining options affect code selection within the JPEG library, + * but they don't need to be visible to most applications using the library. + * To minimize application namespace pollution, the symbols won't be + * defined unless JPEG_INTERNALS or JPEG_INTERNAL_OPTIONS has been defined. + */ + +#ifdef JPEG_INTERNALS +#define JPEG_INTERNAL_OPTIONS +#endif + +#ifdef JPEG_INTERNAL_OPTIONS + + +/* + * These defines indicate whether to include various optional functions. + * Undefining some of these symbols will produce a smaller but less capable + * library. Note that you can leave certain source files out of the + * compilation/linking process if you've #undef'd the corresponding symbols. + * (You may HAVE to do that if your compiler doesn't like null source files.) + */ + +/* Capability options common to encoder and decoder: */ + +#define DCT_ISLOW_SUPPORTED /* slow but accurate integer algorithm */ +#define DCT_IFAST_SUPPORTED /* faster, less accurate integer method */ +#define DCT_FLOAT_SUPPORTED /* floating-point: accurate, fast on fast HW */ + +/* Encoder capability options: */ + +#define C_MULTISCAN_FILES_SUPPORTED /* Multiple-scan JPEG files? */ +#define C_PROGRESSIVE_SUPPORTED /* Progressive JPEG? (Requires MULTISCAN)*/ +#define ENTROPY_OPT_SUPPORTED /* Optimization of entropy coding parms? */ +/* Note: if you selected 12-bit data precision, it is dangerous to turn off + * ENTROPY_OPT_SUPPORTED. The standard Huffman tables are only good for 8-bit + * precision, so jchuff.c normally uses entropy optimization to compute + * usable tables for higher precision. If you don't want to do optimization, + * you'll have to supply different default Huffman tables. + * The exact same statements apply for progressive JPEG: the default tables + * don't work for progressive mode. (This may get fixed, however.) + */ +#define INPUT_SMOOTHING_SUPPORTED /* Input image smoothing option? */ + +/* Decoder capability options: */ + +#define D_MULTISCAN_FILES_SUPPORTED /* Multiple-scan JPEG files? */ +#define D_PROGRESSIVE_SUPPORTED /* Progressive JPEG? (Requires MULTISCAN)*/ +#define SAVE_MARKERS_SUPPORTED /* jpeg_save_markers() needed? */ +#define BLOCK_SMOOTHING_SUPPORTED /* Block smoothing? (Progressive only) */ +#define IDCT_SCALING_SUPPORTED /* Output rescaling via IDCT? */ +#undef UPSAMPLE_SCALING_SUPPORTED /* Output rescaling at upsample stage? */ +#define UPSAMPLE_MERGING_SUPPORTED /* Fast path for sloppy upsampling? */ +#define QUANT_1PASS_SUPPORTED /* 1-pass color quantization? */ +#define QUANT_2PASS_SUPPORTED /* 2-pass color quantization? */ + +/* more capability options later, no doubt */ + + +/* + * The RGB_RED, RGB_GREEN, RGB_BLUE, and RGB_PIXELSIZE macros are a vestigial + * feature of libjpeg. The idea was that, if an application developer needed + * to compress from/decompress to a BGR/BGRX/RGBX/XBGR/XRGB buffer, they could + * change these macros, rebuild libjpeg, and link their application statically + * with it. In reality, few people ever did this, because there were some + * severe restrictions involved (cjpeg and djpeg no longer worked properly, + * compressing/decompressing RGB JPEGs no longer worked properly, and the color + * quantizer wouldn't work with pixel sizes other than 3.) Further, since all + * of the O/S-supplied versions of libjpeg were built with the default values + * of RGB_RED, RGB_GREEN, RGB_BLUE, and RGB_PIXELSIZE, many applications have + * come to regard these values as immutable. + * + * The libjpeg-turbo colorspace extensions provide a much cleaner way of + * compressing from/decompressing to buffers with arbitrary component orders + * and pixel sizes. Thus, we do not support changing the values of RGB_RED, + * RGB_GREEN, RGB_BLUE, or RGB_PIXELSIZE. In addition to the restrictions + * listed above, changing these values will also break the SIMD extensions and + * the regression tests. + */ + +#define RGB_RED 0 /* Offset of Red in an RGB scanline element */ +#define RGB_GREEN 1 /* Offset of Green */ +#define RGB_BLUE 2 /* Offset of Blue */ +#define RGB_PIXELSIZE 3 /* JSAMPLEs per RGB scanline element */ + +#define JPEG_NUMCS 17 + +#define EXT_RGB_RED 0 +#define EXT_RGB_GREEN 1 +#define EXT_RGB_BLUE 2 +#define EXT_RGB_PIXELSIZE 3 + +#define EXT_RGBX_RED 0 +#define EXT_RGBX_GREEN 1 +#define EXT_RGBX_BLUE 2 +#define EXT_RGBX_PIXELSIZE 4 + +#define EXT_BGR_RED 2 +#define EXT_BGR_GREEN 1 +#define EXT_BGR_BLUE 0 +#define EXT_BGR_PIXELSIZE 3 + +#define EXT_BGRX_RED 2 +#define EXT_BGRX_GREEN 1 +#define EXT_BGRX_BLUE 0 +#define EXT_BGRX_PIXELSIZE 4 + +#define EXT_XBGR_RED 3 +#define EXT_XBGR_GREEN 2 +#define EXT_XBGR_BLUE 1 +#define EXT_XBGR_PIXELSIZE 4 + +#define EXT_XRGB_RED 1 +#define EXT_XRGB_GREEN 2 +#define EXT_XRGB_BLUE 3 +#define EXT_XRGB_PIXELSIZE 4 + +static const int rgb_red[JPEG_NUMCS] = { + -1, -1, RGB_RED, -1, -1, -1, EXT_RGB_RED, EXT_RGBX_RED, + EXT_BGR_RED, EXT_BGRX_RED, EXT_XBGR_RED, EXT_XRGB_RED, + EXT_RGBX_RED, EXT_BGRX_RED, EXT_XBGR_RED, EXT_XRGB_RED, + -1 +}; + +static const int rgb_green[JPEG_NUMCS] = { + -1, -1, RGB_GREEN, -1, -1, -1, EXT_RGB_GREEN, EXT_RGBX_GREEN, + EXT_BGR_GREEN, EXT_BGRX_GREEN, EXT_XBGR_GREEN, EXT_XRGB_GREEN, + EXT_RGBX_GREEN, EXT_BGRX_GREEN, EXT_XBGR_GREEN, EXT_XRGB_GREEN, + -1 +}; + +static const int rgb_blue[JPEG_NUMCS] = { + -1, -1, RGB_BLUE, -1, -1, -1, EXT_RGB_BLUE, EXT_RGBX_BLUE, + EXT_BGR_BLUE, EXT_BGRX_BLUE, EXT_XBGR_BLUE, EXT_XRGB_BLUE, + EXT_RGBX_BLUE, EXT_BGRX_BLUE, EXT_XBGR_BLUE, EXT_XRGB_BLUE, + -1 +}; + +static const int rgb_pixelsize[JPEG_NUMCS] = { + -1, -1, RGB_PIXELSIZE, -1, -1, -1, EXT_RGB_PIXELSIZE, EXT_RGBX_PIXELSIZE, + EXT_BGR_PIXELSIZE, EXT_BGRX_PIXELSIZE, EXT_XBGR_PIXELSIZE, EXT_XRGB_PIXELSIZE, + EXT_RGBX_PIXELSIZE, EXT_BGRX_PIXELSIZE, EXT_XBGR_PIXELSIZE, EXT_XRGB_PIXELSIZE, + -1 +}; + +/* Definitions for speed-related optimizations. */ + +/* On some machines (notably 68000 series) "int" is 32 bits, but multiplying + * two 16-bit shorts is faster than multiplying two ints. Define MULTIPLIER + * as short on such a machine. MULTIPLIER must be at least 16 bits wide. + */ + +#ifndef MULTIPLIER +#ifndef WITH_SIMD +#define MULTIPLIER int /* type for fastest integer multiply */ +#else +#define MULTIPLIER short /* prefer 16-bit with SIMD for parellelism */ +#endif +#endif + + +/* FAST_FLOAT should be either float or double, whichever is done faster + * by your compiler. (Note that this type is only used in the floating point + * DCT routines, so it only matters if you've defined DCT_FLOAT_SUPPORTED.) + */ + +#ifndef FAST_FLOAT +#define FAST_FLOAT float +#endif + +#endif /* JPEG_INTERNAL_OPTIONS */ diff --git a/media/libjpeg/jpeg_nbits_table.h b/media/libjpeg/jpeg_nbits_table.h new file mode 100644 index 000000000..fcf73878c --- /dev/null +++ b/media/libjpeg/jpeg_nbits_table.h @@ -0,0 +1,4098 @@ +static const unsigned char jpeg_nbits_table[65536] = { + 0, 1, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, + 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, + 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, + 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, + 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, + 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, + 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, + 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, + 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, + 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, + 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, + 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, + 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, + 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, + 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, + 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, + 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, + 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, + 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, + 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, + 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, + 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, + 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, + 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, + 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, + 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, + 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, + 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, + 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, + 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, + 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, + 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16 +}; diff --git a/media/libjpeg/jpegcomp.h b/media/libjpeg/jpegcomp.h new file mode 100644 index 000000000..ade0d1edc --- /dev/null +++ b/media/libjpeg/jpegcomp.h @@ -0,0 +1,31 @@ +/* + * jpegcomp.h + * + * Copyright (C) 2010, D. R. Commander. + * For conditions of distribution and use, see the accompanying README.ijg + * file. + * + * JPEG compatibility macros + * These declarations are considered internal to the JPEG library; most + * applications using the library shouldn't need to include this file. + */ + +#if JPEG_LIB_VERSION >= 70 +#define _DCT_scaled_size DCT_h_scaled_size +#define _DCT_h_scaled_size DCT_h_scaled_size +#define _DCT_v_scaled_size DCT_v_scaled_size +#define _min_DCT_scaled_size min_DCT_h_scaled_size +#define _min_DCT_h_scaled_size min_DCT_h_scaled_size +#define _min_DCT_v_scaled_size min_DCT_v_scaled_size +#define _jpeg_width jpeg_width +#define _jpeg_height jpeg_height +#else +#define _DCT_scaled_size DCT_scaled_size +#define _DCT_h_scaled_size DCT_scaled_size +#define _DCT_v_scaled_size DCT_scaled_size +#define _min_DCT_scaled_size min_DCT_scaled_size +#define _min_DCT_h_scaled_size min_DCT_scaled_size +#define _min_DCT_v_scaled_size min_DCT_scaled_size +#define _jpeg_width image_width +#define _jpeg_height image_height +#endif diff --git a/media/libjpeg/jpegint.h b/media/libjpeg/jpegint.h new file mode 100644 index 000000000..9979a912d --- /dev/null +++ b/media/libjpeg/jpegint.h @@ -0,0 +1,368 @@ +/* + * jpegint.h + * + * This file was part of the Independent JPEG Group's software: + * Copyright (C) 1991-1997, Thomas G. Lane. + * Modified 1997-2009 by Guido Vollbeding. + * libjpeg-turbo Modifications: + * Copyright (C) 2015-2016, D. R. Commander. + * Copyright (C) 2015, Google, Inc. + * For conditions of distribution and use, see the accompanying README.ijg + * file. + * + * This file provides common declarations for the various JPEG modules. + * These declarations are considered internal to the JPEG library; most + * applications using the library shouldn't need to include this file. + */ + + +/* Declarations for both compression & decompression */ + +typedef enum { /* Operating modes for buffer controllers */ + JBUF_PASS_THRU, /* Plain stripwise operation */ + /* Remaining modes require a full-image buffer to have been created */ + JBUF_SAVE_SOURCE, /* Run source subobject only, save output */ + JBUF_CRANK_DEST, /* Run dest subobject only, using saved data */ + JBUF_SAVE_AND_PASS /* Run both subobjects, save output */ +} J_BUF_MODE; + +/* Values of global_state field (jdapi.c has some dependencies on ordering!) */ +#define CSTATE_START 100 /* after create_compress */ +#define CSTATE_SCANNING 101 /* start_compress done, write_scanlines OK */ +#define CSTATE_RAW_OK 102 /* start_compress done, write_raw_data OK */ +#define CSTATE_WRCOEFS 103 /* jpeg_write_coefficients done */ +#define DSTATE_START 200 /* after create_decompress */ +#define DSTATE_INHEADER 201 /* reading header markers, no SOS yet */ +#define DSTATE_READY 202 /* found SOS, ready for start_decompress */ +#define DSTATE_PRELOAD 203 /* reading multiscan file in start_decompress*/ +#define DSTATE_PRESCAN 204 /* performing dummy pass for 2-pass quant */ +#define DSTATE_SCANNING 205 /* start_decompress done, read_scanlines OK */ +#define DSTATE_RAW_OK 206 /* start_decompress done, read_raw_data OK */ +#define DSTATE_BUFIMAGE 207 /* expecting jpeg_start_output */ +#define DSTATE_BUFPOST 208 /* looking for SOS/EOI in jpeg_finish_output */ +#define DSTATE_RDCOEFS 209 /* reading file in jpeg_read_coefficients */ +#define DSTATE_STOPPING 210 /* looking for EOI in jpeg_finish_decompress */ + + +/* JLONG must hold at least signed 32-bit values. */ +typedef long JLONG; + + +/* + * Left shift macro that handles a negative operand without causing any + * sanitizer warnings + */ + +#define LEFT_SHIFT(a, b) ((JLONG)((unsigned long)(a) << (b))) + + +/* Declarations for compression modules */ + +/* Master control module */ +struct jpeg_comp_master { + void (*prepare_for_pass) (j_compress_ptr cinfo); + void (*pass_startup) (j_compress_ptr cinfo); + void (*finish_pass) (j_compress_ptr cinfo); + + /* State variables made visible to other modules */ + boolean call_pass_startup; /* True if pass_startup must be called */ + boolean is_last_pass; /* True during last pass */ +}; + +/* Main buffer control (downsampled-data buffer) */ +struct jpeg_c_main_controller { + void (*start_pass) (j_compress_ptr cinfo, J_BUF_MODE pass_mode); + void (*process_data) (j_compress_ptr cinfo, JSAMPARRAY input_buf, + JDIMENSION *in_row_ctr, JDIMENSION in_rows_avail); +}; + +/* Compression preprocessing (downsampling input buffer control) */ +struct jpeg_c_prep_controller { + void (*start_pass) (j_compress_ptr cinfo, J_BUF_MODE pass_mode); + void (*pre_process_data) (j_compress_ptr cinfo, JSAMPARRAY input_buf, + JDIMENSION *in_row_ctr, JDIMENSION in_rows_avail, + JSAMPIMAGE output_buf, + JDIMENSION *out_row_group_ctr, + JDIMENSION out_row_groups_avail); +}; + +/* Coefficient buffer control */ +struct jpeg_c_coef_controller { + void (*start_pass) (j_compress_ptr cinfo, J_BUF_MODE pass_mode); + boolean (*compress_data) (j_compress_ptr cinfo, JSAMPIMAGE input_buf); +}; + +/* Colorspace conversion */ +struct jpeg_color_converter { + void (*start_pass) (j_compress_ptr cinfo); + void (*color_convert) (j_compress_ptr cinfo, JSAMPARRAY input_buf, + JSAMPIMAGE output_buf, JDIMENSION output_row, + int num_rows); +}; + +/* Downsampling */ +struct jpeg_downsampler { + void (*start_pass) (j_compress_ptr cinfo); + void (*downsample) (j_compress_ptr cinfo, JSAMPIMAGE input_buf, + JDIMENSION in_row_index, JSAMPIMAGE output_buf, + JDIMENSION out_row_group_index); + + boolean need_context_rows; /* TRUE if need rows above & below */ +}; + +/* Forward DCT (also controls coefficient quantization) */ +struct jpeg_forward_dct { + void (*start_pass) (j_compress_ptr cinfo); + /* perhaps this should be an array??? */ + void (*forward_DCT) (j_compress_ptr cinfo, jpeg_component_info *compptr, + JSAMPARRAY sample_data, JBLOCKROW coef_blocks, + JDIMENSION start_row, JDIMENSION start_col, + JDIMENSION num_blocks); +}; + +/* Entropy encoding */ +struct jpeg_entropy_encoder { + void (*start_pass) (j_compress_ptr cinfo, boolean gather_statistics); + boolean (*encode_mcu) (j_compress_ptr cinfo, JBLOCKROW *MCU_data); + void (*finish_pass) (j_compress_ptr cinfo); +}; + +/* Marker writing */ +struct jpeg_marker_writer { + void (*write_file_header) (j_compress_ptr cinfo); + void (*write_frame_header) (j_compress_ptr cinfo); + void (*write_scan_header) (j_compress_ptr cinfo); + void (*write_file_trailer) (j_compress_ptr cinfo); + void (*write_tables_only) (j_compress_ptr cinfo); + /* These routines are exported to allow insertion of extra markers */ + /* Probably only COM and APPn markers should be written this way */ + void (*write_marker_header) (j_compress_ptr cinfo, int marker, + unsigned int datalen); + void (*write_marker_byte) (j_compress_ptr cinfo, int val); +}; + + +/* Declarations for decompression modules */ + +/* Master control module */ +struct jpeg_decomp_master { + void (*prepare_for_output_pass) (j_decompress_ptr cinfo); + void (*finish_output_pass) (j_decompress_ptr cinfo); + + /* State variables made visible to other modules */ + boolean is_dummy_pass; /* True during 1st pass for 2-pass quant */ + + /* Partial decompression variables */ + JDIMENSION first_iMCU_col; + JDIMENSION last_iMCU_col; + JDIMENSION first_MCU_col[MAX_COMPONENTS]; + JDIMENSION last_MCU_col[MAX_COMPONENTS]; + boolean jinit_upsampler_no_alloc; +}; + +/* Input control module */ +struct jpeg_input_controller { + int (*consume_input) (j_decompress_ptr cinfo); + void (*reset_input_controller) (j_decompress_ptr cinfo); + void (*start_input_pass) (j_decompress_ptr cinfo); + void (*finish_input_pass) (j_decompress_ptr cinfo); + + /* State variables made visible to other modules */ + boolean has_multiple_scans; /* True if file has multiple scans */ + boolean eoi_reached; /* True when EOI has been consumed */ +}; + +/* Main buffer control (downsampled-data buffer) */ +struct jpeg_d_main_controller { + void (*start_pass) (j_decompress_ptr cinfo, J_BUF_MODE pass_mode); + void (*process_data) (j_decompress_ptr cinfo, JSAMPARRAY output_buf, + JDIMENSION *out_row_ctr, JDIMENSION out_rows_avail); +}; + +/* Coefficient buffer control */ +struct jpeg_d_coef_controller { + void (*start_input_pass) (j_decompress_ptr cinfo); + int (*consume_data) (j_decompress_ptr cinfo); + void (*start_output_pass) (j_decompress_ptr cinfo); + int (*decompress_data) (j_decompress_ptr cinfo, JSAMPIMAGE output_buf); + /* Pointer to array of coefficient virtual arrays, or NULL if none */ + jvirt_barray_ptr *coef_arrays; +}; + +/* Decompression postprocessing (color quantization buffer control) */ +struct jpeg_d_post_controller { + void (*start_pass) (j_decompress_ptr cinfo, J_BUF_MODE pass_mode); + void (*post_process_data) (j_decompress_ptr cinfo, JSAMPIMAGE input_buf, + JDIMENSION *in_row_group_ctr, + JDIMENSION in_row_groups_avail, + JSAMPARRAY output_buf, JDIMENSION *out_row_ctr, + JDIMENSION out_rows_avail); +}; + +/* Marker reading & parsing */ +struct jpeg_marker_reader { + void (*reset_marker_reader) (j_decompress_ptr cinfo); + /* Read markers until SOS or EOI. + * Returns same codes as are defined for jpeg_consume_input: + * JPEG_SUSPENDED, JPEG_REACHED_SOS, or JPEG_REACHED_EOI. + */ + int (*read_markers) (j_decompress_ptr cinfo); + /* Read a restart marker --- exported for use by entropy decoder only */ + jpeg_marker_parser_method read_restart_marker; + + /* State of marker reader --- nominally internal, but applications + * supplying COM or APPn handlers might like to know the state. + */ + boolean saw_SOI; /* found SOI? */ + boolean saw_SOF; /* found SOF? */ + int next_restart_num; /* next restart number expected (0-7) */ + unsigned int discarded_bytes; /* # of bytes skipped looking for a marker */ +}; + +/* Entropy decoding */ +struct jpeg_entropy_decoder { + void (*start_pass) (j_decompress_ptr cinfo); + boolean (*decode_mcu) (j_decompress_ptr cinfo, JBLOCKROW *MCU_data); + + /* This is here to share code between baseline and progressive decoders; */ + /* other modules probably should not use it */ + boolean insufficient_data; /* set TRUE after emitting warning */ +}; + +/* Inverse DCT (also performs dequantization) */ +typedef void (*inverse_DCT_method_ptr) (j_decompress_ptr cinfo, + jpeg_component_info *compptr, + JCOEFPTR coef_block, + JSAMPARRAY output_buf, + JDIMENSION output_col); + +struct jpeg_inverse_dct { + void (*start_pass) (j_decompress_ptr cinfo); + /* It is useful to allow each component to have a separate IDCT method. */ + inverse_DCT_method_ptr inverse_DCT[MAX_COMPONENTS]; +}; + +/* Upsampling (note that upsampler must also call color converter) */ +struct jpeg_upsampler { + void (*start_pass) (j_decompress_ptr cinfo); + void (*upsample) (j_decompress_ptr cinfo, JSAMPIMAGE input_buf, + JDIMENSION *in_row_group_ctr, + JDIMENSION in_row_groups_avail, JSAMPARRAY output_buf, + JDIMENSION *out_row_ctr, JDIMENSION out_rows_avail); + + boolean need_context_rows; /* TRUE if need rows above & below */ +}; + +/* Colorspace conversion */ +struct jpeg_color_deconverter { + void (*start_pass) (j_decompress_ptr cinfo); + void (*color_convert) (j_decompress_ptr cinfo, JSAMPIMAGE input_buf, + JDIMENSION input_row, JSAMPARRAY output_buf, + int num_rows); +}; + +/* Color quantization or color precision reduction */ +struct jpeg_color_quantizer { + void (*start_pass) (j_decompress_ptr cinfo, boolean is_pre_scan); + void (*color_quantize) (j_decompress_ptr cinfo, JSAMPARRAY input_buf, + JSAMPARRAY output_buf, int num_rows); + void (*finish_pass) (j_decompress_ptr cinfo); + void (*new_color_map) (j_decompress_ptr cinfo); +}; + + +/* Miscellaneous useful macros */ + +#undef MAX +#define MAX(a,b) ((a) > (b) ? (a) : (b)) +#undef MIN +#define MIN(a,b) ((a) < (b) ? (a) : (b)) + + +/* We assume that right shift corresponds to signed division by 2 with + * rounding towards minus infinity. This is correct for typical "arithmetic + * shift" instructions that shift in copies of the sign bit. But some + * C compilers implement >> with an unsigned shift. For these machines you + * must define RIGHT_SHIFT_IS_UNSIGNED. + * RIGHT_SHIFT provides a proper signed right shift of a JLONG quantity. + * It is only applied with constant shift counts. SHIFT_TEMPS must be + * included in the variables of any routine using RIGHT_SHIFT. + */ + +#ifdef RIGHT_SHIFT_IS_UNSIGNED +#define SHIFT_TEMPS JLONG shift_temp; +#define RIGHT_SHIFT(x,shft) \ + ((shift_temp = (x)) < 0 ? \ + (shift_temp >> (shft)) | ((~((JLONG) 0)) << (32-(shft))) : \ + (shift_temp >> (shft))) +#else +#define SHIFT_TEMPS +#define RIGHT_SHIFT(x,shft) ((x) >> (shft)) +#endif + + +/* Compression module initialization routines */ +EXTERN(void) jinit_compress_master (j_compress_ptr cinfo); +EXTERN(void) jinit_c_master_control (j_compress_ptr cinfo, + boolean transcode_only); +EXTERN(void) jinit_c_main_controller (j_compress_ptr cinfo, + boolean need_full_buffer); +EXTERN(void) jinit_c_prep_controller (j_compress_ptr cinfo, + boolean need_full_buffer); +EXTERN(void) jinit_c_coef_controller (j_compress_ptr cinfo, + boolean need_full_buffer); +EXTERN(void) jinit_color_converter (j_compress_ptr cinfo); +EXTERN(void) jinit_downsampler (j_compress_ptr cinfo); +EXTERN(void) jinit_forward_dct (j_compress_ptr cinfo); +EXTERN(void) jinit_huff_encoder (j_compress_ptr cinfo); +EXTERN(void) jinit_phuff_encoder (j_compress_ptr cinfo); +EXTERN(void) jinit_arith_encoder (j_compress_ptr cinfo); +EXTERN(void) jinit_marker_writer (j_compress_ptr cinfo); +/* Decompression module initialization routines */ +EXTERN(void) jinit_master_decompress (j_decompress_ptr cinfo); +EXTERN(void) jinit_d_main_controller (j_decompress_ptr cinfo, + boolean need_full_buffer); +EXTERN(void) jinit_d_coef_controller (j_decompress_ptr cinfo, + boolean need_full_buffer); +EXTERN(void) jinit_d_post_controller (j_decompress_ptr cinfo, + boolean need_full_buffer); +EXTERN(void) jinit_input_controller (j_decompress_ptr cinfo); +EXTERN(void) jinit_marker_reader (j_decompress_ptr cinfo); +EXTERN(void) jinit_huff_decoder (j_decompress_ptr cinfo); +EXTERN(void) jinit_phuff_decoder (j_decompress_ptr cinfo); +EXTERN(void) jinit_arith_decoder (j_decompress_ptr cinfo); +EXTERN(void) jinit_inverse_dct (j_decompress_ptr cinfo); +EXTERN(void) jinit_upsampler (j_decompress_ptr cinfo); +EXTERN(void) jinit_color_deconverter (j_decompress_ptr cinfo); +EXTERN(void) jinit_1pass_quantizer (j_decompress_ptr cinfo); +EXTERN(void) jinit_2pass_quantizer (j_decompress_ptr cinfo); +EXTERN(void) jinit_merged_upsampler (j_decompress_ptr cinfo); +/* Memory manager initialization */ +EXTERN(void) jinit_memory_mgr (j_common_ptr cinfo); + +/* Utility routines in jutils.c */ +EXTERN(long) jdiv_round_up (long a, long b); +EXTERN(long) jround_up (long a, long b); +EXTERN(void) jcopy_sample_rows (JSAMPARRAY input_array, int source_row, + JSAMPARRAY output_array, int dest_row, + int num_rows, JDIMENSION num_cols); +EXTERN(void) jcopy_block_row (JBLOCKROW input_row, JBLOCKROW output_row, + JDIMENSION num_blocks); +EXTERN(void) jzero_far (void *target, size_t bytestozero); +/* Constant tables in jutils.c */ +#if 0 /* This table is not actually needed in v6a */ +extern const int jpeg_zigzag_order[]; /* natural coef order to zigzag order */ +#endif +extern const int jpeg_natural_order[]; /* zigzag coef order to natural order */ + +/* Arithmetic coding probability estimation tables in jaricom.c */ +extern const JLONG jpeg_aritab[]; + +/* Suppress undefined-structure complaints if necessary. */ + +#ifdef INCOMPLETE_TYPES_BROKEN +#ifndef AM_MEMORY_MANAGER /* only jmemmgr.c defines these */ +struct jvirt_sarray_control { long dummy; }; +struct jvirt_barray_control { long dummy; }; +#endif +#endif /* INCOMPLETE_TYPES_BROKEN */ diff --git a/media/libjpeg/jpeglib.h b/media/libjpeg/jpeglib.h new file mode 100644 index 000000000..6c63f5822 --- /dev/null +++ b/media/libjpeg/jpeglib.h @@ -0,0 +1,1122 @@ +/* + * jpeglib.h + * + * This file was part of the Independent JPEG Group's software: + * Copyright (C) 1991-1998, Thomas G. Lane. + * Modified 2002-2009 by Guido Vollbeding. + * libjpeg-turbo Modifications: + * Copyright (C) 2009-2011, 2013-2014, 2016, D. R. Commander. + * Copyright (C) 2015, Google, Inc. + * For conditions of distribution and use, see the accompanying README.ijg + * file. + * + * This file defines the application interface for the JPEG library. + * Most applications using the library need only include this file, + * and perhaps jerror.h if they want to know the exact error codes. + */ + +#ifndef JPEGLIB_H +#define JPEGLIB_H + +/* + * First we include the configuration files that record how this + * installation of the JPEG library is set up. jconfig.h can be + * generated automatically for many systems. jmorecfg.h contains + * manual configuration options that most people need not worry about. + */ + +#ifndef JCONFIG_INCLUDED /* in case jinclude.h already did */ +#include "jconfig.h" /* widely used configuration options */ +#endif +#include "jmorecfg.h" /* seldom changed options */ + + +#ifdef __cplusplus +#ifndef DONT_USE_EXTERN_C +extern "C" { +#endif +#endif + + +/* Various constants determining the sizes of things. + * All of these are specified by the JPEG standard, so don't change them + * if you want to be compatible. + */ + +#define DCTSIZE 8 /* The basic DCT block is 8x8 samples */ +#define DCTSIZE2 64 /* DCTSIZE squared; # of elements in a block */ +#define NUM_QUANT_TBLS 4 /* Quantization tables are numbered 0..3 */ +#define NUM_HUFF_TBLS 4 /* Huffman tables are numbered 0..3 */ +#define NUM_ARITH_TBLS 16 /* Arith-coding tables are numbered 0..15 */ +#define MAX_COMPS_IN_SCAN 4 /* JPEG limit on # of components in one scan */ +#define MAX_SAMP_FACTOR 4 /* JPEG limit on sampling factors */ +/* Unfortunately, some bozo at Adobe saw no reason to be bound by the standard; + * the PostScript DCT filter can emit files with many more than 10 blocks/MCU. + * If you happen to run across such a file, you can up D_MAX_BLOCKS_IN_MCU + * to handle it. We even let you do this from the jconfig.h file. However, + * we strongly discourage changing C_MAX_BLOCKS_IN_MCU; just because Adobe + * sometimes emits noncompliant files doesn't mean you should too. + */ +#define C_MAX_BLOCKS_IN_MCU 10 /* compressor's limit on blocks per MCU */ +#ifndef D_MAX_BLOCKS_IN_MCU +#define D_MAX_BLOCKS_IN_MCU 10 /* decompressor's limit on blocks per MCU */ +#endif + + +/* Data structures for images (arrays of samples and of DCT coefficients). + */ + +typedef JSAMPLE *JSAMPROW; /* ptr to one image row of pixel samples. */ +typedef JSAMPROW *JSAMPARRAY; /* ptr to some rows (a 2-D sample array) */ +typedef JSAMPARRAY *JSAMPIMAGE; /* a 3-D sample array: top index is color */ + +typedef JCOEF JBLOCK[DCTSIZE2]; /* one block of coefficients */ +typedef JBLOCK *JBLOCKROW; /* pointer to one row of coefficient blocks */ +typedef JBLOCKROW *JBLOCKARRAY; /* a 2-D array of coefficient blocks */ +typedef JBLOCKARRAY *JBLOCKIMAGE; /* a 3-D array of coefficient blocks */ + +typedef JCOEF *JCOEFPTR; /* useful in a couple of places */ + + +/* Types for JPEG compression parameters and working tables. */ + + +/* DCT coefficient quantization tables. */ + +typedef struct { + /* This array gives the coefficient quantizers in natural array order + * (not the zigzag order in which they are stored in a JPEG DQT marker). + * CAUTION: IJG versions prior to v6a kept this array in zigzag order. + */ + UINT16 quantval[DCTSIZE2]; /* quantization step for each coefficient */ + /* This field is used only during compression. It's initialized FALSE when + * the table is created, and set TRUE when it's been output to the file. + * You could suppress output of a table by setting this to TRUE. + * (See jpeg_suppress_tables for an example.) + */ + boolean sent_table; /* TRUE when table has been output */ +} JQUANT_TBL; + + +/* Huffman coding tables. */ + +typedef struct { + /* These two fields directly represent the contents of a JPEG DHT marker */ + UINT8 bits[17]; /* bits[k] = # of symbols with codes of */ + /* length k bits; bits[0] is unused */ + UINT8 huffval[256]; /* The symbols, in order of incr code length */ + /* This field is used only during compression. It's initialized FALSE when + * the table is created, and set TRUE when it's been output to the file. + * You could suppress output of a table by setting this to TRUE. + * (See jpeg_suppress_tables for an example.) + */ + boolean sent_table; /* TRUE when table has been output */ +} JHUFF_TBL; + + +/* Basic info about one component (color channel). */ + +typedef struct { + /* These values are fixed over the whole image. */ + /* For compression, they must be supplied by parameter setup; */ + /* for decompression, they are read from the SOF marker. */ + int component_id; /* identifier for this component (0..255) */ + int component_index; /* its index in SOF or cinfo->comp_info[] */ + int h_samp_factor; /* horizontal sampling factor (1..4) */ + int v_samp_factor; /* vertical sampling factor (1..4) */ + int quant_tbl_no; /* quantization table selector (0..3) */ + /* These values may vary between scans. */ + /* For compression, they must be supplied by parameter setup; */ + /* for decompression, they are read from the SOS marker. */ + /* The decompressor output side may not use these variables. */ + int dc_tbl_no; /* DC entropy table selector (0..3) */ + int ac_tbl_no; /* AC entropy table selector (0..3) */ + + /* Remaining fields should be treated as private by applications. */ + + /* These values are computed during compression or decompression startup: */ + /* Component's size in DCT blocks. + * Any dummy blocks added to complete an MCU are not counted; therefore + * these values do not depend on whether a scan is interleaved or not. + */ + JDIMENSION width_in_blocks; + JDIMENSION height_in_blocks; + /* Size of a DCT block in samples. Always DCTSIZE for compression. + * For decompression this is the size of the output from one DCT block, + * reflecting any scaling we choose to apply during the IDCT step. + * Values from 1 to 16 are supported. + * Note that different components may receive different IDCT scalings. + */ +#if JPEG_LIB_VERSION >= 70 + int DCT_h_scaled_size; + int DCT_v_scaled_size; +#else + int DCT_scaled_size; +#endif + /* The downsampled dimensions are the component's actual, unpadded number + * of samples at the main buffer (preprocessing/compression interface), thus + * downsampled_width = ceil(image_width * Hi/Hmax) + * and similarly for height. For decompression, IDCT scaling is included, so + * downsampled_width = ceil(image_width * Hi/Hmax * DCT_[h_]scaled_size/DCTSIZE) + */ + JDIMENSION downsampled_width; /* actual width in samples */ + JDIMENSION downsampled_height; /* actual height in samples */ + /* This flag is used only for decompression. In cases where some of the + * components will be ignored (eg grayscale output from YCbCr image), + * we can skip most computations for the unused components. + */ + boolean component_needed; /* do we need the value of this component? */ + + /* These values are computed before starting a scan of the component. */ + /* The decompressor output side may not use these variables. */ + int MCU_width; /* number of blocks per MCU, horizontally */ + int MCU_height; /* number of blocks per MCU, vertically */ + int MCU_blocks; /* MCU_width * MCU_height */ + int MCU_sample_width; /* MCU width in samples, MCU_width*DCT_[h_]scaled_size */ + int last_col_width; /* # of non-dummy blocks across in last MCU */ + int last_row_height; /* # of non-dummy blocks down in last MCU */ + + /* Saved quantization table for component; NULL if none yet saved. + * See jdinput.c comments about the need for this information. + * This field is currently used only for decompression. + */ + JQUANT_TBL *quant_table; + + /* Private per-component storage for DCT or IDCT subsystem. */ + void *dct_table; +} jpeg_component_info; + + +/* The script for encoding a multiple-scan file is an array of these: */ + +typedef struct { + int comps_in_scan; /* number of components encoded in this scan */ + int component_index[MAX_COMPS_IN_SCAN]; /* their SOF/comp_info[] indexes */ + int Ss, Se; /* progressive JPEG spectral selection parms */ + int Ah, Al; /* progressive JPEG successive approx. parms */ +} jpeg_scan_info; + +/* The decompressor can save APPn and COM markers in a list of these: */ + +typedef struct jpeg_marker_struct *jpeg_saved_marker_ptr; + +struct jpeg_marker_struct { + jpeg_saved_marker_ptr next; /* next in list, or NULL */ + UINT8 marker; /* marker code: JPEG_COM, or JPEG_APP0+n */ + unsigned int original_length; /* # bytes of data in the file */ + unsigned int data_length; /* # bytes of data saved at data[] */ + JOCTET *data; /* the data contained in the marker */ + /* the marker length word is not counted in data_length or original_length */ +}; + +/* Known color spaces. */ + +#define JCS_EXTENSIONS 1 +#define JCS_ALPHA_EXTENSIONS 1 + +typedef enum { + JCS_UNKNOWN, /* error/unspecified */ + JCS_GRAYSCALE, /* monochrome */ + JCS_RGB, /* red/green/blue as specified by the RGB_RED, + RGB_GREEN, RGB_BLUE, and RGB_PIXELSIZE macros */ + JCS_YCbCr, /* Y/Cb/Cr (also known as YUV) */ + JCS_CMYK, /* C/M/Y/K */ + JCS_YCCK, /* Y/Cb/Cr/K */ + JCS_EXT_RGB, /* red/green/blue */ + JCS_EXT_RGBX, /* red/green/blue/x */ + JCS_EXT_BGR, /* blue/green/red */ + JCS_EXT_BGRX, /* blue/green/red/x */ + JCS_EXT_XBGR, /* x/blue/green/red */ + JCS_EXT_XRGB, /* x/red/green/blue */ + /* When out_color_space it set to JCS_EXT_RGBX, JCS_EXT_BGRX, JCS_EXT_XBGR, + or JCS_EXT_XRGB during decompression, the X byte is undefined, and in + order to ensure the best performance, libjpeg-turbo can set that byte to + whatever value it wishes. Use the following colorspace constants to + ensure that the X byte is set to 0xFF, so that it can be interpreted as an + opaque alpha channel. */ + JCS_EXT_RGBA, /* red/green/blue/alpha */ + JCS_EXT_BGRA, /* blue/green/red/alpha */ + JCS_EXT_ABGR, /* alpha/blue/green/red */ + JCS_EXT_ARGB, /* alpha/red/green/blue */ + JCS_RGB565 /* 5-bit red/6-bit green/5-bit blue */ +} J_COLOR_SPACE; + +/* DCT/IDCT algorithm options. */ + +typedef enum { + JDCT_ISLOW, /* slow but accurate integer algorithm */ + JDCT_IFAST, /* faster, less accurate integer method */ + JDCT_FLOAT /* floating-point: accurate, fast on fast HW */ +} J_DCT_METHOD; + +#ifndef JDCT_DEFAULT /* may be overridden in jconfig.h */ +#define JDCT_DEFAULT JDCT_ISLOW +#endif +#ifndef JDCT_FASTEST /* may be overridden in jconfig.h */ +#define JDCT_FASTEST JDCT_IFAST +#endif + +/* Dithering options for decompression. */ + +typedef enum { + JDITHER_NONE, /* no dithering */ + JDITHER_ORDERED, /* simple ordered dither */ + JDITHER_FS /* Floyd-Steinberg error diffusion dither */ +} J_DITHER_MODE; + + +/* Common fields between JPEG compression and decompression master structs. */ + +#define jpeg_common_fields \ + struct jpeg_error_mgr *err; /* Error handler module */\ + struct jpeg_memory_mgr *mem; /* Memory manager module */\ + struct jpeg_progress_mgr *progress; /* Progress monitor, or NULL if none */\ + void *client_data; /* Available for use by application */\ + boolean is_decompressor; /* So common code can tell which is which */\ + int global_state /* For checking call sequence validity */ + +/* Routines that are to be used by both halves of the library are declared + * to receive a pointer to this structure. There are no actual instances of + * jpeg_common_struct, only of jpeg_compress_struct and jpeg_decompress_struct. + */ +struct jpeg_common_struct { + jpeg_common_fields; /* Fields common to both master struct types */ + /* Additional fields follow in an actual jpeg_compress_struct or + * jpeg_decompress_struct. All three structs must agree on these + * initial fields! (This would be a lot cleaner in C++.) + */ +}; + +typedef struct jpeg_common_struct *j_common_ptr; +typedef struct jpeg_compress_struct *j_compress_ptr; +typedef struct jpeg_decompress_struct *j_decompress_ptr; + + +/* Master record for a compression instance */ + +struct jpeg_compress_struct { + jpeg_common_fields; /* Fields shared with jpeg_decompress_struct */ + + /* Destination for compressed data */ + struct jpeg_destination_mgr *dest; + + /* Description of source image --- these fields must be filled in by + * outer application before starting compression. in_color_space must + * be correct before you can even call jpeg_set_defaults(). + */ + + JDIMENSION image_width; /* input image width */ + JDIMENSION image_height; /* input image height */ + int input_components; /* # of color components in input image */ + J_COLOR_SPACE in_color_space; /* colorspace of input image */ + + double input_gamma; /* image gamma of input image */ + + /* Compression parameters --- these fields must be set before calling + * jpeg_start_compress(). We recommend calling jpeg_set_defaults() to + * initialize everything to reasonable defaults, then changing anything + * the application specifically wants to change. That way you won't get + * burnt when new parameters are added. Also note that there are several + * helper routines to simplify changing parameters. + */ + +#if JPEG_LIB_VERSION >= 70 + unsigned int scale_num, scale_denom; /* fraction by which to scale image */ + + JDIMENSION jpeg_width; /* scaled JPEG image width */ + JDIMENSION jpeg_height; /* scaled JPEG image height */ + /* Dimensions of actual JPEG image that will be written to file, + * derived from input dimensions by scaling factors above. + * These fields are computed by jpeg_start_compress(). + * You can also use jpeg_calc_jpeg_dimensions() to determine these values + * in advance of calling jpeg_start_compress(). + */ +#endif + + int data_precision; /* bits of precision in image data */ + + int num_components; /* # of color components in JPEG image */ + J_COLOR_SPACE jpeg_color_space; /* colorspace of JPEG image */ + + jpeg_component_info *comp_info; + /* comp_info[i] describes component that appears i'th in SOF */ + + JQUANT_TBL *quant_tbl_ptrs[NUM_QUANT_TBLS]; +#if JPEG_LIB_VERSION >= 70 + int q_scale_factor[NUM_QUANT_TBLS]; +#endif + /* ptrs to coefficient quantization tables, or NULL if not defined, + * and corresponding scale factors (percentage, initialized 100). + */ + + JHUFF_TBL *dc_huff_tbl_ptrs[NUM_HUFF_TBLS]; + JHUFF_TBL *ac_huff_tbl_ptrs[NUM_HUFF_TBLS]; + /* ptrs to Huffman coding tables, or NULL if not defined */ + + UINT8 arith_dc_L[NUM_ARITH_TBLS]; /* L values for DC arith-coding tables */ + UINT8 arith_dc_U[NUM_ARITH_TBLS]; /* U values for DC arith-coding tables */ + UINT8 arith_ac_K[NUM_ARITH_TBLS]; /* Kx values for AC arith-coding tables */ + + int num_scans; /* # of entries in scan_info array */ + const jpeg_scan_info *scan_info; /* script for multi-scan file, or NULL */ + /* The default value of scan_info is NULL, which causes a single-scan + * sequential JPEG file to be emitted. To create a multi-scan file, + * set num_scans and scan_info to point to an array of scan definitions. + */ + + boolean raw_data_in; /* TRUE=caller supplies downsampled data */ + boolean arith_code; /* TRUE=arithmetic coding, FALSE=Huffman */ + boolean optimize_coding; /* TRUE=optimize entropy encoding parms */ + boolean CCIR601_sampling; /* TRUE=first samples are cosited */ +#if JPEG_LIB_VERSION >= 70 + boolean do_fancy_downsampling; /* TRUE=apply fancy downsampling */ +#endif + int smoothing_factor; /* 1..100, or 0 for no input smoothing */ + J_DCT_METHOD dct_method; /* DCT algorithm selector */ + + /* The restart interval can be specified in absolute MCUs by setting + * restart_interval, or in MCU rows by setting restart_in_rows + * (in which case the correct restart_interval will be figured + * for each scan). + */ + unsigned int restart_interval; /* MCUs per restart, or 0 for no restart */ + int restart_in_rows; /* if > 0, MCU rows per restart interval */ + + /* Parameters controlling emission of special markers. */ + + boolean write_JFIF_header; /* should a JFIF marker be written? */ + UINT8 JFIF_major_version; /* What to write for the JFIF version number */ + UINT8 JFIF_minor_version; + /* These three values are not used by the JPEG code, merely copied */ + /* into the JFIF APP0 marker. density_unit can be 0 for unknown, */ + /* 1 for dots/inch, or 2 for dots/cm. Note that the pixel aspect */ + /* ratio is defined by X_density/Y_density even when density_unit=0. */ + UINT8 density_unit; /* JFIF code for pixel size units */ + UINT16 X_density; /* Horizontal pixel density */ + UINT16 Y_density; /* Vertical pixel density */ + boolean write_Adobe_marker; /* should an Adobe marker be written? */ + + /* State variable: index of next scanline to be written to + * jpeg_write_scanlines(). Application may use this to control its + * processing loop, e.g., "while (next_scanline < image_height)". + */ + + JDIMENSION next_scanline; /* 0 .. image_height-1 */ + + /* Remaining fields are known throughout compressor, but generally + * should not be touched by a surrounding application. + */ + + /* + * These fields are computed during compression startup + */ + boolean progressive_mode; /* TRUE if scan script uses progressive mode */ + int max_h_samp_factor; /* largest h_samp_factor */ + int max_v_samp_factor; /* largest v_samp_factor */ + +#if JPEG_LIB_VERSION >= 70 + int min_DCT_h_scaled_size; /* smallest DCT_h_scaled_size of any component */ + int min_DCT_v_scaled_size; /* smallest DCT_v_scaled_size of any component */ +#endif + + JDIMENSION total_iMCU_rows; /* # of iMCU rows to be input to coef ctlr */ + /* The coefficient controller receives data in units of MCU rows as defined + * for fully interleaved scans (whether the JPEG file is interleaved or not). + * There are v_samp_factor * DCTSIZE sample rows of each component in an + * "iMCU" (interleaved MCU) row. + */ + + /* + * These fields are valid during any one scan. + * They describe the components and MCUs actually appearing in the scan. + */ + int comps_in_scan; /* # of JPEG components in this scan */ + jpeg_component_info *cur_comp_info[MAX_COMPS_IN_SCAN]; + /* *cur_comp_info[i] describes component that appears i'th in SOS */ + + JDIMENSION MCUs_per_row; /* # of MCUs across the image */ + JDIMENSION MCU_rows_in_scan; /* # of MCU rows in the image */ + + int blocks_in_MCU; /* # of DCT blocks per MCU */ + int MCU_membership[C_MAX_BLOCKS_IN_MCU]; + /* MCU_membership[i] is index in cur_comp_info of component owning */ + /* i'th block in an MCU */ + + int Ss, Se, Ah, Al; /* progressive JPEG parameters for scan */ + +#if JPEG_LIB_VERSION >= 80 + int block_size; /* the basic DCT block size: 1..16 */ + const int *natural_order; /* natural-order position array */ + int lim_Se; /* min( Se, DCTSIZE2-1 ) */ +#endif + + /* + * Links to compression subobjects (methods and private variables of modules) + */ + struct jpeg_comp_master *master; + struct jpeg_c_main_controller *main; + struct jpeg_c_prep_controller *prep; + struct jpeg_c_coef_controller *coef; + struct jpeg_marker_writer *marker; + struct jpeg_color_converter *cconvert; + struct jpeg_downsampler *downsample; + struct jpeg_forward_dct *fdct; + struct jpeg_entropy_encoder *entropy; + jpeg_scan_info *script_space; /* workspace for jpeg_simple_progression */ + int script_space_size; +}; + + +/* Master record for a decompression instance */ + +struct jpeg_decompress_struct { + jpeg_common_fields; /* Fields shared with jpeg_compress_struct */ + + /* Source of compressed data */ + struct jpeg_source_mgr *src; + + /* Basic description of image --- filled in by jpeg_read_header(). */ + /* Application may inspect these values to decide how to process image. */ + + JDIMENSION image_width; /* nominal image width (from SOF marker) */ + JDIMENSION image_height; /* nominal image height */ + int num_components; /* # of color components in JPEG image */ + J_COLOR_SPACE jpeg_color_space; /* colorspace of JPEG image */ + + /* Decompression processing parameters --- these fields must be set before + * calling jpeg_start_decompress(). Note that jpeg_read_header() initializes + * them to default values. + */ + + J_COLOR_SPACE out_color_space; /* colorspace for output */ + + unsigned int scale_num, scale_denom; /* fraction by which to scale image */ + + double output_gamma; /* image gamma wanted in output */ + + boolean buffered_image; /* TRUE=multiple output passes */ + boolean raw_data_out; /* TRUE=downsampled data wanted */ + + J_DCT_METHOD dct_method; /* IDCT algorithm selector */ + boolean do_fancy_upsampling; /* TRUE=apply fancy upsampling */ + boolean do_block_smoothing; /* TRUE=apply interblock smoothing */ + + boolean quantize_colors; /* TRUE=colormapped output wanted */ + /* the following are ignored if not quantize_colors: */ + J_DITHER_MODE dither_mode; /* type of color dithering to use */ + boolean two_pass_quantize; /* TRUE=use two-pass color quantization */ + int desired_number_of_colors; /* max # colors to use in created colormap */ + /* these are significant only in buffered-image mode: */ + boolean enable_1pass_quant; /* enable future use of 1-pass quantizer */ + boolean enable_external_quant;/* enable future use of external colormap */ + boolean enable_2pass_quant; /* enable future use of 2-pass quantizer */ + + /* Description of actual output image that will be returned to application. + * These fields are computed by jpeg_start_decompress(). + * You can also use jpeg_calc_output_dimensions() to determine these values + * in advance of calling jpeg_start_decompress(). + */ + + JDIMENSION output_width; /* scaled image width */ + JDIMENSION output_height; /* scaled image height */ + int out_color_components; /* # of color components in out_color_space */ + int output_components; /* # of color components returned */ + /* output_components is 1 (a colormap index) when quantizing colors; + * otherwise it equals out_color_components. + */ + int rec_outbuf_height; /* min recommended height of scanline buffer */ + /* If the buffer passed to jpeg_read_scanlines() is less than this many rows + * high, space and time will be wasted due to unnecessary data copying. + * Usually rec_outbuf_height will be 1 or 2, at most 4. + */ + + /* When quantizing colors, the output colormap is described by these fields. + * The application can supply a colormap by setting colormap non-NULL before + * calling jpeg_start_decompress; otherwise a colormap is created during + * jpeg_start_decompress or jpeg_start_output. + * The map has out_color_components rows and actual_number_of_colors columns. + */ + int actual_number_of_colors; /* number of entries in use */ + JSAMPARRAY colormap; /* The color map as a 2-D pixel array */ + + /* State variables: these variables indicate the progress of decompression. + * The application may examine these but must not modify them. + */ + + /* Row index of next scanline to be read from jpeg_read_scanlines(). + * Application may use this to control its processing loop, e.g., + * "while (output_scanline < output_height)". + */ + JDIMENSION output_scanline; /* 0 .. output_height-1 */ + + /* Current input scan number and number of iMCU rows completed in scan. + * These indicate the progress of the decompressor input side. + */ + int input_scan_number; /* Number of SOS markers seen so far */ + JDIMENSION input_iMCU_row; /* Number of iMCU rows completed */ + + /* The "output scan number" is the notional scan being displayed by the + * output side. The decompressor will not allow output scan/row number + * to get ahead of input scan/row, but it can fall arbitrarily far behind. + */ + int output_scan_number; /* Nominal scan number being displayed */ + JDIMENSION output_iMCU_row; /* Number of iMCU rows read */ + + /* Current progression status. coef_bits[c][i] indicates the precision + * with which component c's DCT coefficient i (in zigzag order) is known. + * It is -1 when no data has yet been received, otherwise it is the point + * transform (shift) value for the most recent scan of the coefficient + * (thus, 0 at completion of the progression). + * This pointer is NULL when reading a non-progressive file. + */ + int (*coef_bits)[DCTSIZE2]; /* -1 or current Al value for each coef */ + + /* Internal JPEG parameters --- the application usually need not look at + * these fields. Note that the decompressor output side may not use + * any parameters that can change between scans. + */ + + /* Quantization and Huffman tables are carried forward across input + * datastreams when processing abbreviated JPEG datastreams. + */ + + JQUANT_TBL *quant_tbl_ptrs[NUM_QUANT_TBLS]; + /* ptrs to coefficient quantization tables, or NULL if not defined */ + + JHUFF_TBL *dc_huff_tbl_ptrs[NUM_HUFF_TBLS]; + JHUFF_TBL *ac_huff_tbl_ptrs[NUM_HUFF_TBLS]; + /* ptrs to Huffman coding tables, or NULL if not defined */ + + /* These parameters are never carried across datastreams, since they + * are given in SOF/SOS markers or defined to be reset by SOI. + */ + + int data_precision; /* bits of precision in image data */ + + jpeg_component_info *comp_info; + /* comp_info[i] describes component that appears i'th in SOF */ + +#if JPEG_LIB_VERSION >= 80 + boolean is_baseline; /* TRUE if Baseline SOF0 encountered */ +#endif + boolean progressive_mode; /* TRUE if SOFn specifies progressive mode */ + boolean arith_code; /* TRUE=arithmetic coding, FALSE=Huffman */ + + UINT8 arith_dc_L[NUM_ARITH_TBLS]; /* L values for DC arith-coding tables */ + UINT8 arith_dc_U[NUM_ARITH_TBLS]; /* U values for DC arith-coding tables */ + UINT8 arith_ac_K[NUM_ARITH_TBLS]; /* Kx values for AC arith-coding tables */ + + unsigned int restart_interval; /* MCUs per restart interval, or 0 for no restart */ + + /* These fields record data obtained from optional markers recognized by + * the JPEG library. + */ + boolean saw_JFIF_marker; /* TRUE iff a JFIF APP0 marker was found */ + /* Data copied from JFIF marker; only valid if saw_JFIF_marker is TRUE: */ + UINT8 JFIF_major_version; /* JFIF version number */ + UINT8 JFIF_minor_version; + UINT8 density_unit; /* JFIF code for pixel size units */ + UINT16 X_density; /* Horizontal pixel density */ + UINT16 Y_density; /* Vertical pixel density */ + boolean saw_Adobe_marker; /* TRUE iff an Adobe APP14 marker was found */ + UINT8 Adobe_transform; /* Color transform code from Adobe marker */ + + boolean CCIR601_sampling; /* TRUE=first samples are cosited */ + + /* Aside from the specific data retained from APPn markers known to the + * library, the uninterpreted contents of any or all APPn and COM markers + * can be saved in a list for examination by the application. + */ + jpeg_saved_marker_ptr marker_list; /* Head of list of saved markers */ + + /* Remaining fields are known throughout decompressor, but generally + * should not be touched by a surrounding application. + */ + + /* + * These fields are computed during decompression startup + */ + int max_h_samp_factor; /* largest h_samp_factor */ + int max_v_samp_factor; /* largest v_samp_factor */ + +#if JPEG_LIB_VERSION >= 70 + int min_DCT_h_scaled_size; /* smallest DCT_h_scaled_size of any component */ + int min_DCT_v_scaled_size; /* smallest DCT_v_scaled_size of any component */ +#else + int min_DCT_scaled_size; /* smallest DCT_scaled_size of any component */ +#endif + + JDIMENSION total_iMCU_rows; /* # of iMCU rows in image */ + /* The coefficient controller's input and output progress is measured in + * units of "iMCU" (interleaved MCU) rows. These are the same as MCU rows + * in fully interleaved JPEG scans, but are used whether the scan is + * interleaved or not. We define an iMCU row as v_samp_factor DCT block + * rows of each component. Therefore, the IDCT output contains + * v_samp_factor*DCT_[v_]scaled_size sample rows of a component per iMCU row. + */ + + JSAMPLE *sample_range_limit; /* table for fast range-limiting */ + + /* + * These fields are valid during any one scan. + * They describe the components and MCUs actually appearing in the scan. + * Note that the decompressor output side must not use these fields. + */ + int comps_in_scan; /* # of JPEG components in this scan */ + jpeg_component_info *cur_comp_info[MAX_COMPS_IN_SCAN]; + /* *cur_comp_info[i] describes component that appears i'th in SOS */ + + JDIMENSION MCUs_per_row; /* # of MCUs across the image */ + JDIMENSION MCU_rows_in_scan; /* # of MCU rows in the image */ + + int blocks_in_MCU; /* # of DCT blocks per MCU */ + int MCU_membership[D_MAX_BLOCKS_IN_MCU]; + /* MCU_membership[i] is index in cur_comp_info of component owning */ + /* i'th block in an MCU */ + + int Ss, Se, Ah, Al; /* progressive JPEG parameters for scan */ + +#if JPEG_LIB_VERSION >= 80 + /* These fields are derived from Se of first SOS marker. + */ + int block_size; /* the basic DCT block size: 1..16 */ + const int *natural_order; /* natural-order position array for entropy decode */ + int lim_Se; /* min( Se, DCTSIZE2-1 ) for entropy decode */ +#endif + + /* This field is shared between entropy decoder and marker parser. + * It is either zero or the code of a JPEG marker that has been + * read from the data source, but has not yet been processed. + */ + int unread_marker; + + /* + * Links to decompression subobjects (methods, private variables of modules) + */ + struct jpeg_decomp_master *master; + struct jpeg_d_main_controller *main; + struct jpeg_d_coef_controller *coef; + struct jpeg_d_post_controller *post; + struct jpeg_input_controller *inputctl; + struct jpeg_marker_reader *marker; + struct jpeg_entropy_decoder *entropy; + struct jpeg_inverse_dct *idct; + struct jpeg_upsampler *upsample; + struct jpeg_color_deconverter *cconvert; + struct jpeg_color_quantizer *cquantize; +}; + + +/* "Object" declarations for JPEG modules that may be supplied or called + * directly by the surrounding application. + * As with all objects in the JPEG library, these structs only define the + * publicly visible methods and state variables of a module. Additional + * private fields may exist after the public ones. + */ + + +/* Error handler object */ + +struct jpeg_error_mgr { + /* Error exit handler: does not return to caller */ + void (*error_exit) (j_common_ptr cinfo); + /* Conditionally emit a trace or warning message */ + void (*emit_message) (j_common_ptr cinfo, int msg_level); + /* Routine that actually outputs a trace or error message */ + void (*output_message) (j_common_ptr cinfo); + /* Format a message string for the most recent JPEG error or message */ + void (*format_message) (j_common_ptr cinfo, char *buffer); +#define JMSG_LENGTH_MAX 200 /* recommended size of format_message buffer */ + /* Reset error state variables at start of a new image */ + void (*reset_error_mgr) (j_common_ptr cinfo); + + /* The message ID code and any parameters are saved here. + * A message can have one string parameter or up to 8 int parameters. + */ + int msg_code; +#define JMSG_STR_PARM_MAX 80 + union { + int i[8]; + char s[JMSG_STR_PARM_MAX]; + } msg_parm; + + /* Standard state variables for error facility */ + + int trace_level; /* max msg_level that will be displayed */ + + /* For recoverable corrupt-data errors, we emit a warning message, + * but keep going unless emit_message chooses to abort. emit_message + * should count warnings in num_warnings. The surrounding application + * can check for bad data by seeing if num_warnings is nonzero at the + * end of processing. + */ + long num_warnings; /* number of corrupt-data warnings */ + + /* These fields point to the table(s) of error message strings. + * An application can change the table pointer to switch to a different + * message list (typically, to change the language in which errors are + * reported). Some applications may wish to add additional error codes + * that will be handled by the JPEG library error mechanism; the second + * table pointer is used for this purpose. + * + * First table includes all errors generated by JPEG library itself. + * Error code 0 is reserved for a "no such error string" message. + */ + const char * const *jpeg_message_table; /* Library errors */ + int last_jpeg_message; /* Table contains strings 0..last_jpeg_message */ + /* Second table can be added by application (see cjpeg/djpeg for example). + * It contains strings numbered first_addon_message..last_addon_message. + */ + const char * const *addon_message_table; /* Non-library errors */ + int first_addon_message; /* code for first string in addon table */ + int last_addon_message; /* code for last string in addon table */ +}; + + +/* Progress monitor object */ + +struct jpeg_progress_mgr { + void (*progress_monitor) (j_common_ptr cinfo); + + long pass_counter; /* work units completed in this pass */ + long pass_limit; /* total number of work units in this pass */ + int completed_passes; /* passes completed so far */ + int total_passes; /* total number of passes expected */ +}; + + +/* Data destination object for compression */ + +struct jpeg_destination_mgr { + JOCTET *next_output_byte; /* => next byte to write in buffer */ + size_t free_in_buffer; /* # of byte spaces remaining in buffer */ + + void (*init_destination) (j_compress_ptr cinfo); + boolean (*empty_output_buffer) (j_compress_ptr cinfo); + void (*term_destination) (j_compress_ptr cinfo); +}; + + +/* Data source object for decompression */ + +struct jpeg_source_mgr { + const JOCTET *next_input_byte; /* => next byte to read from buffer */ + size_t bytes_in_buffer; /* # of bytes remaining in buffer */ + + void (*init_source) (j_decompress_ptr cinfo); + boolean (*fill_input_buffer) (j_decompress_ptr cinfo); + void (*skip_input_data) (j_decompress_ptr cinfo, long num_bytes); + boolean (*resync_to_restart) (j_decompress_ptr cinfo, int desired); + void (*term_source) (j_decompress_ptr cinfo); +}; + + +/* Memory manager object. + * Allocates "small" objects (a few K total), "large" objects (tens of K), + * and "really big" objects (virtual arrays with backing store if needed). + * The memory manager does not allow individual objects to be freed; rather, + * each created object is assigned to a pool, and whole pools can be freed + * at once. This is faster and more convenient than remembering exactly what + * to free, especially where malloc()/free() are not too speedy. + * NB: alloc routines never return NULL. They exit to error_exit if not + * successful. + */ + +#define JPOOL_PERMANENT 0 /* lasts until master record is destroyed */ +#define JPOOL_IMAGE 1 /* lasts until done with image/datastream */ +#define JPOOL_NUMPOOLS 2 + +typedef struct jvirt_sarray_control *jvirt_sarray_ptr; +typedef struct jvirt_barray_control *jvirt_barray_ptr; + + +struct jpeg_memory_mgr { + /* Method pointers */ + void *(*alloc_small) (j_common_ptr cinfo, int pool_id, size_t sizeofobject); + void *(*alloc_large) (j_common_ptr cinfo, int pool_id, + size_t sizeofobject); + JSAMPARRAY (*alloc_sarray) (j_common_ptr cinfo, int pool_id, + JDIMENSION samplesperrow, JDIMENSION numrows); + JBLOCKARRAY (*alloc_barray) (j_common_ptr cinfo, int pool_id, + JDIMENSION blocksperrow, JDIMENSION numrows); + jvirt_sarray_ptr (*request_virt_sarray) (j_common_ptr cinfo, int pool_id, + boolean pre_zero, + JDIMENSION samplesperrow, + JDIMENSION numrows, + JDIMENSION maxaccess); + jvirt_barray_ptr (*request_virt_barray) (j_common_ptr cinfo, int pool_id, + boolean pre_zero, + JDIMENSION blocksperrow, + JDIMENSION numrows, + JDIMENSION maxaccess); + void (*realize_virt_arrays) (j_common_ptr cinfo); + JSAMPARRAY (*access_virt_sarray) (j_common_ptr cinfo, jvirt_sarray_ptr ptr, + JDIMENSION start_row, JDIMENSION num_rows, + boolean writable); + JBLOCKARRAY (*access_virt_barray) (j_common_ptr cinfo, jvirt_barray_ptr ptr, + JDIMENSION start_row, JDIMENSION num_rows, + boolean writable); + void (*free_pool) (j_common_ptr cinfo, int pool_id); + void (*self_destruct) (j_common_ptr cinfo); + + /* Limit on memory allocation for this JPEG object. (Note that this is + * merely advisory, not a guaranteed maximum; it only affects the space + * used for virtual-array buffers.) May be changed by outer application + * after creating the JPEG object. + */ + long max_memory_to_use; + + /* Maximum allocation request accepted by alloc_large. */ + long max_alloc_chunk; +}; + + +/* Routine signature for application-supplied marker processing methods. + * Need not pass marker code since it is stored in cinfo->unread_marker. + */ +typedef boolean (*jpeg_marker_parser_method) (j_decompress_ptr cinfo); + + +/* Originally, this macro was used as a way of defining function prototypes + * for both modern compilers as well as older compilers that did not support + * prototype parameters. libjpeg-turbo has never supported these older, + * non-ANSI compilers, but the macro is still included because there is some + * software out there that uses it. + */ + +#define JPP(arglist) arglist + + +/* Default error-management setup */ +EXTERN(struct jpeg_error_mgr *) jpeg_std_error (struct jpeg_error_mgr *err); + +/* Initialization of JPEG compression objects. + * jpeg_create_compress() and jpeg_create_decompress() are the exported + * names that applications should call. These expand to calls on + * jpeg_CreateCompress and jpeg_CreateDecompress with additional information + * passed for version mismatch checking. + * NB: you must set up the error-manager BEFORE calling jpeg_create_xxx. + */ +#define jpeg_create_compress(cinfo) \ + jpeg_CreateCompress((cinfo), JPEG_LIB_VERSION, \ + (size_t) sizeof(struct jpeg_compress_struct)) +#define jpeg_create_decompress(cinfo) \ + jpeg_CreateDecompress((cinfo), JPEG_LIB_VERSION, \ + (size_t) sizeof(struct jpeg_decompress_struct)) +EXTERN(void) jpeg_CreateCompress (j_compress_ptr cinfo, int version, + size_t structsize); +EXTERN(void) jpeg_CreateDecompress (j_decompress_ptr cinfo, int version, + size_t structsize); +/* Destruction of JPEG compression objects */ +EXTERN(void) jpeg_destroy_compress (j_compress_ptr cinfo); +EXTERN(void) jpeg_destroy_decompress (j_decompress_ptr cinfo); + +/* Standard data source and destination managers: stdio streams. */ +/* Caller is responsible for opening the file before and closing after. */ +EXTERN(void) jpeg_stdio_dest (j_compress_ptr cinfo, FILE *outfile); +EXTERN(void) jpeg_stdio_src (j_decompress_ptr cinfo, FILE *infile); + +#if JPEG_LIB_VERSION >= 80 || defined(MEM_SRCDST_SUPPORTED) +/* Data source and destination managers: memory buffers. */ +EXTERN(void) jpeg_mem_dest (j_compress_ptr cinfo, unsigned char **outbuffer, + unsigned long *outsize); +EXTERN(void) jpeg_mem_src (j_decompress_ptr cinfo, + const unsigned char *inbuffer, + unsigned long insize); +#endif + +/* Default parameter setup for compression */ +EXTERN(void) jpeg_set_defaults (j_compress_ptr cinfo); +/* Compression parameter setup aids */ +EXTERN(void) jpeg_set_colorspace (j_compress_ptr cinfo, + J_COLOR_SPACE colorspace); +EXTERN(void) jpeg_default_colorspace (j_compress_ptr cinfo); +EXTERN(void) jpeg_set_quality (j_compress_ptr cinfo, int quality, + boolean force_baseline); +EXTERN(void) jpeg_set_linear_quality (j_compress_ptr cinfo, int scale_factor, + boolean force_baseline); +#if JPEG_LIB_VERSION >= 70 +EXTERN(void) jpeg_default_qtables (j_compress_ptr cinfo, + boolean force_baseline); +#endif +EXTERN(void) jpeg_add_quant_table (j_compress_ptr cinfo, int which_tbl, + const unsigned int *basic_table, + int scale_factor, boolean force_baseline); +EXTERN(int) jpeg_quality_scaling (int quality); +EXTERN(void) jpeg_simple_progression (j_compress_ptr cinfo); +EXTERN(void) jpeg_suppress_tables (j_compress_ptr cinfo, boolean suppress); +EXTERN(JQUANT_TBL *) jpeg_alloc_quant_table (j_common_ptr cinfo); +EXTERN(JHUFF_TBL *) jpeg_alloc_huff_table (j_common_ptr cinfo); + +/* Main entry points for compression */ +EXTERN(void) jpeg_start_compress (j_compress_ptr cinfo, + boolean write_all_tables); +EXTERN(JDIMENSION) jpeg_write_scanlines (j_compress_ptr cinfo, + JSAMPARRAY scanlines, + JDIMENSION num_lines); +EXTERN(void) jpeg_finish_compress (j_compress_ptr cinfo); + +#if JPEG_LIB_VERSION >= 70 +/* Precalculate JPEG dimensions for current compression parameters. */ +EXTERN(void) jpeg_calc_jpeg_dimensions (j_compress_ptr cinfo); +#endif + +/* Replaces jpeg_write_scanlines when writing raw downsampled data. */ +EXTERN(JDIMENSION) jpeg_write_raw_data (j_compress_ptr cinfo, JSAMPIMAGE data, + JDIMENSION num_lines); + +/* Write a special marker. See libjpeg.txt concerning safe usage. */ +EXTERN(void) jpeg_write_marker (j_compress_ptr cinfo, int marker, + const JOCTET *dataptr, unsigned int datalen); +/* Same, but piecemeal. */ +EXTERN(void) jpeg_write_m_header (j_compress_ptr cinfo, int marker, + unsigned int datalen); +EXTERN(void) jpeg_write_m_byte (j_compress_ptr cinfo, int val); + +/* Alternate compression function: just write an abbreviated table file */ +EXTERN(void) jpeg_write_tables (j_compress_ptr cinfo); + +/* Decompression startup: read start of JPEG datastream to see what's there */ +EXTERN(int) jpeg_read_header (j_decompress_ptr cinfo, boolean require_image); +/* Return value is one of: */ +#define JPEG_SUSPENDED 0 /* Suspended due to lack of input data */ +#define JPEG_HEADER_OK 1 /* Found valid image datastream */ +#define JPEG_HEADER_TABLES_ONLY 2 /* Found valid table-specs-only datastream */ +/* If you pass require_image = TRUE (normal case), you need not check for + * a TABLES_ONLY return code; an abbreviated file will cause an error exit. + * JPEG_SUSPENDED is only possible if you use a data source module that can + * give a suspension return (the stdio source module doesn't). + */ + +/* Main entry points for decompression */ +EXTERN(boolean) jpeg_start_decompress (j_decompress_ptr cinfo); +EXTERN(JDIMENSION) jpeg_read_scanlines (j_decompress_ptr cinfo, + JSAMPARRAY scanlines, + JDIMENSION max_lines); +EXTERN(JDIMENSION) jpeg_skip_scanlines (j_decompress_ptr cinfo, + JDIMENSION num_lines); +EXTERN(void) jpeg_crop_scanline (j_decompress_ptr cinfo, JDIMENSION *xoffset, + JDIMENSION *width); +EXTERN(boolean) jpeg_finish_decompress (j_decompress_ptr cinfo); + +/* Replaces jpeg_read_scanlines when reading raw downsampled data. */ +EXTERN(JDIMENSION) jpeg_read_raw_data (j_decompress_ptr cinfo, JSAMPIMAGE data, + JDIMENSION max_lines); + +/* Additional entry points for buffered-image mode. */ +EXTERN(boolean) jpeg_has_multiple_scans (j_decompress_ptr cinfo); +EXTERN(boolean) jpeg_start_output (j_decompress_ptr cinfo, int scan_number); +EXTERN(boolean) jpeg_finish_output (j_decompress_ptr cinfo); +EXTERN(boolean) jpeg_input_complete (j_decompress_ptr cinfo); +EXTERN(void) jpeg_new_colormap (j_decompress_ptr cinfo); +EXTERN(int) jpeg_consume_input (j_decompress_ptr cinfo); +/* Return value is one of: */ +/* #define JPEG_SUSPENDED 0 Suspended due to lack of input data */ +#define JPEG_REACHED_SOS 1 /* Reached start of new scan */ +#define JPEG_REACHED_EOI 2 /* Reached end of image */ +#define JPEG_ROW_COMPLETED 3 /* Completed one iMCU row */ +#define JPEG_SCAN_COMPLETED 4 /* Completed last iMCU row of a scan */ + +/* Precalculate output dimensions for current decompression parameters. */ +#if JPEG_LIB_VERSION >= 80 +EXTERN(void) jpeg_core_output_dimensions (j_decompress_ptr cinfo); +#endif +EXTERN(void) jpeg_calc_output_dimensions (j_decompress_ptr cinfo); + +/* Control saving of COM and APPn markers into marker_list. */ +EXTERN(void) jpeg_save_markers (j_decompress_ptr cinfo, int marker_code, + unsigned int length_limit); + +/* Install a special processing method for COM or APPn markers. */ +EXTERN(void) jpeg_set_marker_processor (j_decompress_ptr cinfo, + int marker_code, + jpeg_marker_parser_method routine); + +/* Read or write raw DCT coefficients --- useful for lossless transcoding. */ +EXTERN(jvirt_barray_ptr *) jpeg_read_coefficients (j_decompress_ptr cinfo); +EXTERN(void) jpeg_write_coefficients (j_compress_ptr cinfo, + jvirt_barray_ptr *coef_arrays); +EXTERN(void) jpeg_copy_critical_parameters (j_decompress_ptr srcinfo, + j_compress_ptr dstinfo); + +/* If you choose to abort compression or decompression before completing + * jpeg_finish_(de)compress, then you need to clean up to release memory, + * temporary files, etc. You can just call jpeg_destroy_(de)compress + * if you're done with the JPEG object, but if you want to clean it up and + * reuse it, call this: + */ +EXTERN(void) jpeg_abort_compress (j_compress_ptr cinfo); +EXTERN(void) jpeg_abort_decompress (j_decompress_ptr cinfo); + +/* Generic versions of jpeg_abort and jpeg_destroy that work on either + * flavor of JPEG object. These may be more convenient in some places. + */ +EXTERN(void) jpeg_abort (j_common_ptr cinfo); +EXTERN(void) jpeg_destroy (j_common_ptr cinfo); + +/* Default restart-marker-resync procedure for use by data source modules */ +EXTERN(boolean) jpeg_resync_to_restart (j_decompress_ptr cinfo, int desired); + + +/* These marker codes are exported since applications and data source modules + * are likely to want to use them. + */ + +#define JPEG_RST0 0xD0 /* RST0 marker code */ +#define JPEG_EOI 0xD9 /* EOI marker code */ +#define JPEG_APP0 0xE0 /* APP0 marker code */ +#define JPEG_COM 0xFE /* COM marker code */ + + +/* If we have a brain-damaged compiler that emits warnings (or worse, errors) + * for structure definitions that are never filled in, keep it quiet by + * supplying dummy definitions for the various substructures. + */ + +#ifdef INCOMPLETE_TYPES_BROKEN +#ifndef JPEG_INTERNALS /* will be defined in jpegint.h */ +struct jvirt_sarray_control { long dummy; }; +struct jvirt_barray_control { long dummy; }; +struct jpeg_comp_master { long dummy; }; +struct jpeg_c_main_controller { long dummy; }; +struct jpeg_c_prep_controller { long dummy; }; +struct jpeg_c_coef_controller { long dummy; }; +struct jpeg_marker_writer { long dummy; }; +struct jpeg_color_converter { long dummy; }; +struct jpeg_downsampler { long dummy; }; +struct jpeg_forward_dct { long dummy; }; +struct jpeg_entropy_encoder { long dummy; }; +struct jpeg_decomp_master { long dummy; }; +struct jpeg_d_main_controller { long dummy; }; +struct jpeg_d_coef_controller { long dummy; }; +struct jpeg_d_post_controller { long dummy; }; +struct jpeg_input_controller { long dummy; }; +struct jpeg_marker_reader { long dummy; }; +struct jpeg_entropy_decoder { long dummy; }; +struct jpeg_inverse_dct { long dummy; }; +struct jpeg_upsampler { long dummy; }; +struct jpeg_color_deconverter { long dummy; }; +struct jpeg_color_quantizer { long dummy; }; +#endif /* JPEG_INTERNALS */ +#endif /* INCOMPLETE_TYPES_BROKEN */ + + +/* + * The JPEG library modules define JPEG_INTERNALS before including this file. + * The internal structure declarations are read only when that is true. + * Applications using the library should not include jpegint.h, but may wish + * to include jerror.h. + */ + +#ifdef JPEG_INTERNALS +#include "jpegint.h" /* fetch private declarations */ +#include "jerror.h" /* fetch error codes too */ +#endif + +#ifdef __cplusplus +#ifndef DONT_USE_EXTERN_C +} +#endif +#endif + +#endif /* JPEGLIB_H */ diff --git a/media/libjpeg/jquant1.c b/media/libjpeg/jquant1.c new file mode 100644 index 000000000..e7814815e --- /dev/null +++ b/media/libjpeg/jquant1.c @@ -0,0 +1,857 @@ +/* + * jquant1.c + * + * This file was part of the Independent JPEG Group's software: + * Copyright (C) 1991-1996, Thomas G. Lane. + * libjpeg-turbo Modifications: + * Copyright (C) 2009, 2015, D. R. Commander. + * For conditions of distribution and use, see the accompanying README.ijg + * file. + * + * This file contains 1-pass color quantization (color mapping) routines. + * These routines provide mapping to a fixed color map using equally spaced + * color values. Optional Floyd-Steinberg or ordered dithering is available. + */ + +#define JPEG_INTERNALS +#include "jinclude.h" +#include "jpeglib.h" + +#ifdef QUANT_1PASS_SUPPORTED + + +/* + * The main purpose of 1-pass quantization is to provide a fast, if not very + * high quality, colormapped output capability. A 2-pass quantizer usually + * gives better visual quality; however, for quantized grayscale output this + * quantizer is perfectly adequate. Dithering is highly recommended with this + * quantizer, though you can turn it off if you really want to. + * + * In 1-pass quantization the colormap must be chosen in advance of seeing the + * image. We use a map consisting of all combinations of Ncolors[i] color + * values for the i'th component. The Ncolors[] values are chosen so that + * their product, the total number of colors, is no more than that requested. + * (In most cases, the product will be somewhat less.) + * + * Since the colormap is orthogonal, the representative value for each color + * component can be determined without considering the other components; + * then these indexes can be combined into a colormap index by a standard + * N-dimensional-array-subscript calculation. Most of the arithmetic involved + * can be precalculated and stored in the lookup table colorindex[]. + * colorindex[i][j] maps pixel value j in component i to the nearest + * representative value (grid plane) for that component; this index is + * multiplied by the array stride for component i, so that the + * index of the colormap entry closest to a given pixel value is just + * sum( colorindex[component-number][pixel-component-value] ) + * Aside from being fast, this scheme allows for variable spacing between + * representative values with no additional lookup cost. + * + * If gamma correction has been applied in color conversion, it might be wise + * to adjust the color grid spacing so that the representative colors are + * equidistant in linear space. At this writing, gamma correction is not + * implemented by jdcolor, so nothing is done here. + */ + + +/* Declarations for ordered dithering. + * + * We use a standard 16x16 ordered dither array. The basic concept of ordered + * dithering is described in many references, for instance Dale Schumacher's + * chapter II.2 of Graphics Gems II (James Arvo, ed. Academic Press, 1991). + * In place of Schumacher's comparisons against a "threshold" value, we add a + * "dither" value to the input pixel and then round the result to the nearest + * output value. The dither value is equivalent to (0.5 - threshold) times + * the distance between output values. For ordered dithering, we assume that + * the output colors are equally spaced; if not, results will probably be + * worse, since the dither may be too much or too little at a given point. + * + * The normal calculation would be to form pixel value + dither, range-limit + * this to 0..MAXJSAMPLE, and then index into the colorindex table as usual. + * We can skip the separate range-limiting step by extending the colorindex + * table in both directions. + */ + +#define ODITHER_SIZE 16 /* dimension of dither matrix */ +/* NB: if ODITHER_SIZE is not a power of 2, ODITHER_MASK uses will break */ +#define ODITHER_CELLS (ODITHER_SIZE*ODITHER_SIZE) /* # cells in matrix */ +#define ODITHER_MASK (ODITHER_SIZE-1) /* mask for wrapping around counters */ + +typedef int ODITHER_MATRIX[ODITHER_SIZE][ODITHER_SIZE]; +typedef int (*ODITHER_MATRIX_PTR)[ODITHER_SIZE]; + +static const UINT8 base_dither_matrix[ODITHER_SIZE][ODITHER_SIZE] = { + /* Bayer's order-4 dither array. Generated by the code given in + * Stephen Hawley's article "Ordered Dithering" in Graphics Gems I. + * The values in this array must range from 0 to ODITHER_CELLS-1. + */ + { 0,192, 48,240, 12,204, 60,252, 3,195, 51,243, 15,207, 63,255 }, + { 128, 64,176,112,140, 76,188,124,131, 67,179,115,143, 79,191,127 }, + { 32,224, 16,208, 44,236, 28,220, 35,227, 19,211, 47,239, 31,223 }, + { 160, 96,144, 80,172,108,156, 92,163, 99,147, 83,175,111,159, 95 }, + { 8,200, 56,248, 4,196, 52,244, 11,203, 59,251, 7,199, 55,247 }, + { 136, 72,184,120,132, 68,180,116,139, 75,187,123,135, 71,183,119 }, + { 40,232, 24,216, 36,228, 20,212, 43,235, 27,219, 39,231, 23,215 }, + { 168,104,152, 88,164,100,148, 84,171,107,155, 91,167,103,151, 87 }, + { 2,194, 50,242, 14,206, 62,254, 1,193, 49,241, 13,205, 61,253 }, + { 130, 66,178,114,142, 78,190,126,129, 65,177,113,141, 77,189,125 }, + { 34,226, 18,210, 46,238, 30,222, 33,225, 17,209, 45,237, 29,221 }, + { 162, 98,146, 82,174,110,158, 94,161, 97,145, 81,173,109,157, 93 }, + { 10,202, 58,250, 6,198, 54,246, 9,201, 57,249, 5,197, 53,245 }, + { 138, 74,186,122,134, 70,182,118,137, 73,185,121,133, 69,181,117 }, + { 42,234, 26,218, 38,230, 22,214, 41,233, 25,217, 37,229, 21,213 }, + { 170,106,154, 90,166,102,150, 86,169,105,153, 89,165,101,149, 85 } +}; + + +/* Declarations for Floyd-Steinberg dithering. + * + * Errors are accumulated into the array fserrors[], at a resolution of + * 1/16th of a pixel count. The error at a given pixel is propagated + * to its not-yet-processed neighbors using the standard F-S fractions, + * ... (here) 7/16 + * 3/16 5/16 1/16 + * We work left-to-right on even rows, right-to-left on odd rows. + * + * We can get away with a single array (holding one row's worth of errors) + * by using it to store the current row's errors at pixel columns not yet + * processed, but the next row's errors at columns already processed. We + * need only a few extra variables to hold the errors immediately around the + * current column. (If we are lucky, those variables are in registers, but + * even if not, they're probably cheaper to access than array elements are.) + * + * The fserrors[] array is indexed [component#][position]. + * We provide (#columns + 2) entries per component; the extra entry at each + * end saves us from special-casing the first and last pixels. + */ + +#if BITS_IN_JSAMPLE == 8 +typedef INT16 FSERROR; /* 16 bits should be enough */ +typedef int LOCFSERROR; /* use 'int' for calculation temps */ +#else +typedef JLONG FSERROR; /* may need more than 16 bits */ +typedef JLONG LOCFSERROR; /* be sure calculation temps are big enough */ +#endif + +typedef FSERROR *FSERRPTR; /* pointer to error array */ + + +/* Private subobject */ + +#define MAX_Q_COMPS 4 /* max components I can handle */ + +typedef struct { + struct jpeg_color_quantizer pub; /* public fields */ + + /* Initially allocated colormap is saved here */ + JSAMPARRAY sv_colormap; /* The color map as a 2-D pixel array */ + int sv_actual; /* number of entries in use */ + + JSAMPARRAY colorindex; /* Precomputed mapping for speed */ + /* colorindex[i][j] = index of color closest to pixel value j in component i, + * premultiplied as described above. Since colormap indexes must fit into + * JSAMPLEs, the entries of this array will too. + */ + boolean is_padded; /* is the colorindex padded for odither? */ + + int Ncolors[MAX_Q_COMPS]; /* # of values alloced to each component */ + + /* Variables for ordered dithering */ + int row_index; /* cur row's vertical index in dither matrix */ + ODITHER_MATRIX_PTR odither[MAX_Q_COMPS]; /* one dither array per component */ + + /* Variables for Floyd-Steinberg dithering */ + FSERRPTR fserrors[MAX_Q_COMPS]; /* accumulated errors */ + boolean on_odd_row; /* flag to remember which row we are on */ +} my_cquantizer; + +typedef my_cquantizer *my_cquantize_ptr; + + +/* + * Policy-making subroutines for create_colormap and create_colorindex. + * These routines determine the colormap to be used. The rest of the module + * only assumes that the colormap is orthogonal. + * + * * select_ncolors decides how to divvy up the available colors + * among the components. + * * output_value defines the set of representative values for a component. + * * largest_input_value defines the mapping from input values to + * representative values for a component. + * Note that the latter two routines may impose different policies for + * different components, though this is not currently done. + */ + + +LOCAL(int) +select_ncolors (j_decompress_ptr cinfo, int Ncolors[]) +/* Determine allocation of desired colors to components, */ +/* and fill in Ncolors[] array to indicate choice. */ +/* Return value is total number of colors (product of Ncolors[] values). */ +{ + int nc = cinfo->out_color_components; /* number of color components */ + int max_colors = cinfo->desired_number_of_colors; + int total_colors, iroot, i, j; + boolean changed; + long temp; + int RGB_order[3] = { RGB_GREEN, RGB_RED, RGB_BLUE }; + RGB_order[0] = rgb_green[cinfo->out_color_space]; + RGB_order[1] = rgb_red[cinfo->out_color_space]; + RGB_order[2] = rgb_blue[cinfo->out_color_space]; + + /* We can allocate at least the nc'th root of max_colors per component. */ + /* Compute floor(nc'th root of max_colors). */ + iroot = 1; + do { + iroot++; + temp = iroot; /* set temp = iroot ** nc */ + for (i = 1; i < nc; i++) + temp *= iroot; + } while (temp <= (long) max_colors); /* repeat till iroot exceeds root */ + iroot--; /* now iroot = floor(root) */ + + /* Must have at least 2 color values per component */ + if (iroot < 2) + ERREXIT1(cinfo, JERR_QUANT_FEW_COLORS, (int) temp); + + /* Initialize to iroot color values for each component */ + total_colors = 1; + for (i = 0; i < nc; i++) { + Ncolors[i] = iroot; + total_colors *= iroot; + } + /* We may be able to increment the count for one or more components without + * exceeding max_colors, though we know not all can be incremented. + * Sometimes, the first component can be incremented more than once! + * (Example: for 16 colors, we start at 2*2*2, go to 3*2*2, then 4*2*2.) + * In RGB colorspace, try to increment G first, then R, then B. + */ + do { + changed = FALSE; + for (i = 0; i < nc; i++) { + j = (cinfo->out_color_space == JCS_RGB ? RGB_order[i] : i); + /* calculate new total_colors if Ncolors[j] is incremented */ + temp = total_colors / Ncolors[j]; + temp *= Ncolors[j]+1; /* done in long arith to avoid oflo */ + if (temp > (long) max_colors) + break; /* won't fit, done with this pass */ + Ncolors[j]++; /* OK, apply the increment */ + total_colors = (int) temp; + changed = TRUE; + } + } while (changed); + + return total_colors; +} + + +LOCAL(int) +output_value (j_decompress_ptr cinfo, int ci, int j, int maxj) +/* Return j'th output value, where j will range from 0 to maxj */ +/* The output values must fall in 0..MAXJSAMPLE in increasing order */ +{ + /* We always provide values 0 and MAXJSAMPLE for each component; + * any additional values are equally spaced between these limits. + * (Forcing the upper and lower values to the limits ensures that + * dithering can't produce a color outside the selected gamut.) + */ + return (int) (((JLONG) j * MAXJSAMPLE + maxj/2) / maxj); +} + + +LOCAL(int) +largest_input_value (j_decompress_ptr cinfo, int ci, int j, int maxj) +/* Return largest input value that should map to j'th output value */ +/* Must have largest(j=0) >= 0, and largest(j=maxj) >= MAXJSAMPLE */ +{ + /* Breakpoints are halfway between values returned by output_value */ + return (int) (((JLONG) (2*j + 1) * MAXJSAMPLE + maxj) / (2*maxj)); +} + + +/* + * Create the colormap. + */ + +LOCAL(void) +create_colormap (j_decompress_ptr cinfo) +{ + my_cquantize_ptr cquantize = (my_cquantize_ptr) cinfo->cquantize; + JSAMPARRAY colormap; /* Created colormap */ + int total_colors; /* Number of distinct output colors */ + int i,j,k, nci, blksize, blkdist, ptr, val; + + /* Select number of colors for each component */ + total_colors = select_ncolors(cinfo, cquantize->Ncolors); + + /* Report selected color counts */ + if (cinfo->out_color_components == 3) + TRACEMS4(cinfo, 1, JTRC_QUANT_3_NCOLORS, + total_colors, cquantize->Ncolors[0], + cquantize->Ncolors[1], cquantize->Ncolors[2]); + else + TRACEMS1(cinfo, 1, JTRC_QUANT_NCOLORS, total_colors); + + /* Allocate and fill in the colormap. */ + /* The colors are ordered in the map in standard row-major order, */ + /* i.e. rightmost (highest-indexed) color changes most rapidly. */ + + colormap = (*cinfo->mem->alloc_sarray) + ((j_common_ptr) cinfo, JPOOL_IMAGE, + (JDIMENSION) total_colors, (JDIMENSION) cinfo->out_color_components); + + /* blksize is number of adjacent repeated entries for a component */ + /* blkdist is distance between groups of identical entries for a component */ + blkdist = total_colors; + + for (i = 0; i < cinfo->out_color_components; i++) { + /* fill in colormap entries for i'th color component */ + nci = cquantize->Ncolors[i]; /* # of distinct values for this color */ + blksize = blkdist / nci; + for (j = 0; j < nci; j++) { + /* Compute j'th output value (out of nci) for component */ + val = output_value(cinfo, i, j, nci-1); + /* Fill in all colormap entries that have this value of this component */ + for (ptr = j * blksize; ptr < total_colors; ptr += blkdist) { + /* fill in blksize entries beginning at ptr */ + for (k = 0; k < blksize; k++) + colormap[i][ptr+k] = (JSAMPLE) val; + } + } + blkdist = blksize; /* blksize of this color is blkdist of next */ + } + + /* Save the colormap in private storage, + * where it will survive color quantization mode changes. + */ + cquantize->sv_colormap = colormap; + cquantize->sv_actual = total_colors; +} + + +/* + * Create the color index table. + */ + +LOCAL(void) +create_colorindex (j_decompress_ptr cinfo) +{ + my_cquantize_ptr cquantize = (my_cquantize_ptr) cinfo->cquantize; + JSAMPROW indexptr; + int i,j,k, nci, blksize, val, pad; + + /* For ordered dither, we pad the color index tables by MAXJSAMPLE in + * each direction (input index values can be -MAXJSAMPLE .. 2*MAXJSAMPLE). + * This is not necessary in the other dithering modes. However, we + * flag whether it was done in case user changes dithering mode. + */ + if (cinfo->dither_mode == JDITHER_ORDERED) { + pad = MAXJSAMPLE*2; + cquantize->is_padded = TRUE; + } else { + pad = 0; + cquantize->is_padded = FALSE; + } + + cquantize->colorindex = (*cinfo->mem->alloc_sarray) + ((j_common_ptr) cinfo, JPOOL_IMAGE, + (JDIMENSION) (MAXJSAMPLE+1 + pad), + (JDIMENSION) cinfo->out_color_components); + + /* blksize is number of adjacent repeated entries for a component */ + blksize = cquantize->sv_actual; + + for (i = 0; i < cinfo->out_color_components; i++) { + /* fill in colorindex entries for i'th color component */ + nci = cquantize->Ncolors[i]; /* # of distinct values for this color */ + blksize = blksize / nci; + + /* adjust colorindex pointers to provide padding at negative indexes. */ + if (pad) + cquantize->colorindex[i] += MAXJSAMPLE; + + /* in loop, val = index of current output value, */ + /* and k = largest j that maps to current val */ + indexptr = cquantize->colorindex[i]; + val = 0; + k = largest_input_value(cinfo, i, 0, nci-1); + for (j = 0; j <= MAXJSAMPLE; j++) { + while (j > k) /* advance val if past boundary */ + k = largest_input_value(cinfo, i, ++val, nci-1); + /* premultiply so that no multiplication needed in main processing */ + indexptr[j] = (JSAMPLE) (val * blksize); + } + /* Pad at both ends if necessary */ + if (pad) + for (j = 1; j <= MAXJSAMPLE; j++) { + indexptr[-j] = indexptr[0]; + indexptr[MAXJSAMPLE+j] = indexptr[MAXJSAMPLE]; + } + } +} + + +/* + * Create an ordered-dither array for a component having ncolors + * distinct output values. + */ + +LOCAL(ODITHER_MATRIX_PTR) +make_odither_array (j_decompress_ptr cinfo, int ncolors) +{ + ODITHER_MATRIX_PTR odither; + int j,k; + JLONG num,den; + + odither = (ODITHER_MATRIX_PTR) + (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE, + sizeof(ODITHER_MATRIX)); + /* The inter-value distance for this color is MAXJSAMPLE/(ncolors-1). + * Hence the dither value for the matrix cell with fill order f + * (f=0..N-1) should be (N-1-2*f)/(2*N) * MAXJSAMPLE/(ncolors-1). + * On 16-bit-int machine, be careful to avoid overflow. + */ + den = 2 * ODITHER_CELLS * ((JLONG) (ncolors - 1)); + for (j = 0; j < ODITHER_SIZE; j++) { + for (k = 0; k < ODITHER_SIZE; k++) { + num = ((JLONG) (ODITHER_CELLS-1 - 2*((int)base_dither_matrix[j][k]))) + * MAXJSAMPLE; + /* Ensure round towards zero despite C's lack of consistency + * about rounding negative values in integer division... + */ + odither[j][k] = (int) (num<0 ? -((-num)/den) : num/den); + } + } + return odither; +} + + +/* + * Create the ordered-dither tables. + * Components having the same number of representative colors may + * share a dither table. + */ + +LOCAL(void) +create_odither_tables (j_decompress_ptr cinfo) +{ + my_cquantize_ptr cquantize = (my_cquantize_ptr) cinfo->cquantize; + ODITHER_MATRIX_PTR odither; + int i, j, nci; + + for (i = 0; i < cinfo->out_color_components; i++) { + nci = cquantize->Ncolors[i]; /* # of distinct values for this color */ + odither = NULL; /* search for matching prior component */ + for (j = 0; j < i; j++) { + if (nci == cquantize->Ncolors[j]) { + odither = cquantize->odither[j]; + break; + } + } + if (odither == NULL) /* need a new table? */ + odither = make_odither_array(cinfo, nci); + cquantize->odither[i] = odither; + } +} + + +/* + * Map some rows of pixels to the output colormapped representation. + */ + +METHODDEF(void) +color_quantize (j_decompress_ptr cinfo, JSAMPARRAY input_buf, + JSAMPARRAY output_buf, int num_rows) +/* General case, no dithering */ +{ + my_cquantize_ptr cquantize = (my_cquantize_ptr) cinfo->cquantize; + JSAMPARRAY colorindex = cquantize->colorindex; + register int pixcode, ci; + register JSAMPROW ptrin, ptrout; + int row; + JDIMENSION col; + JDIMENSION width = cinfo->output_width; + register int nc = cinfo->out_color_components; + + for (row = 0; row < num_rows; row++) { + ptrin = input_buf[row]; + ptrout = output_buf[row]; + for (col = width; col > 0; col--) { + pixcode = 0; + for (ci = 0; ci < nc; ci++) { + pixcode += GETJSAMPLE(colorindex[ci][GETJSAMPLE(*ptrin++)]); + } + *ptrout++ = (JSAMPLE) pixcode; + } + } +} + + +METHODDEF(void) +color_quantize3 (j_decompress_ptr cinfo, JSAMPARRAY input_buf, + JSAMPARRAY output_buf, int num_rows) +/* Fast path for out_color_components==3, no dithering */ +{ + my_cquantize_ptr cquantize = (my_cquantize_ptr) cinfo->cquantize; + register int pixcode; + register JSAMPROW ptrin, ptrout; + JSAMPROW colorindex0 = cquantize->colorindex[0]; + JSAMPROW colorindex1 = cquantize->colorindex[1]; + JSAMPROW colorindex2 = cquantize->colorindex[2]; + int row; + JDIMENSION col; + JDIMENSION width = cinfo->output_width; + + for (row = 0; row < num_rows; row++) { + ptrin = input_buf[row]; + ptrout = output_buf[row]; + for (col = width; col > 0; col--) { + pixcode = GETJSAMPLE(colorindex0[GETJSAMPLE(*ptrin++)]); + pixcode += GETJSAMPLE(colorindex1[GETJSAMPLE(*ptrin++)]); + pixcode += GETJSAMPLE(colorindex2[GETJSAMPLE(*ptrin++)]); + *ptrout++ = (JSAMPLE) pixcode; + } + } +} + + +METHODDEF(void) +quantize_ord_dither (j_decompress_ptr cinfo, JSAMPARRAY input_buf, + JSAMPARRAY output_buf, int num_rows) +/* General case, with ordered dithering */ +{ + my_cquantize_ptr cquantize = (my_cquantize_ptr) cinfo->cquantize; + register JSAMPROW input_ptr; + register JSAMPROW output_ptr; + JSAMPROW colorindex_ci; + int *dither; /* points to active row of dither matrix */ + int row_index, col_index; /* current indexes into dither matrix */ + int nc = cinfo->out_color_components; + int ci; + int row; + JDIMENSION col; + JDIMENSION width = cinfo->output_width; + + for (row = 0; row < num_rows; row++) { + /* Initialize output values to 0 so can process components separately */ + jzero_far((void *) output_buf[row], (size_t) (width * sizeof(JSAMPLE))); + row_index = cquantize->row_index; + for (ci = 0; ci < nc; ci++) { + input_ptr = input_buf[row] + ci; + output_ptr = output_buf[row]; + colorindex_ci = cquantize->colorindex[ci]; + dither = cquantize->odither[ci][row_index]; + col_index = 0; + + for (col = width; col > 0; col--) { + /* Form pixel value + dither, range-limit to 0..MAXJSAMPLE, + * select output value, accumulate into output code for this pixel. + * Range-limiting need not be done explicitly, as we have extended + * the colorindex table to produce the right answers for out-of-range + * inputs. The maximum dither is +- MAXJSAMPLE; this sets the + * required amount of padding. + */ + *output_ptr += colorindex_ci[GETJSAMPLE(*input_ptr)+dither[col_index]]; + input_ptr += nc; + output_ptr++; + col_index = (col_index + 1) & ODITHER_MASK; + } + } + /* Advance row index for next row */ + row_index = (row_index + 1) & ODITHER_MASK; + cquantize->row_index = row_index; + } +} + + +METHODDEF(void) +quantize3_ord_dither (j_decompress_ptr cinfo, JSAMPARRAY input_buf, + JSAMPARRAY output_buf, int num_rows) +/* Fast path for out_color_components==3, with ordered dithering */ +{ + my_cquantize_ptr cquantize = (my_cquantize_ptr) cinfo->cquantize; + register int pixcode; + register JSAMPROW input_ptr; + register JSAMPROW output_ptr; + JSAMPROW colorindex0 = cquantize->colorindex[0]; + JSAMPROW colorindex1 = cquantize->colorindex[1]; + JSAMPROW colorindex2 = cquantize->colorindex[2]; + int *dither0; /* points to active row of dither matrix */ + int *dither1; + int *dither2; + int row_index, col_index; /* current indexes into dither matrix */ + int row; + JDIMENSION col; + JDIMENSION width = cinfo->output_width; + + for (row = 0; row < num_rows; row++) { + row_index = cquantize->row_index; + input_ptr = input_buf[row]; + output_ptr = output_buf[row]; + dither0 = cquantize->odither[0][row_index]; + dither1 = cquantize->odither[1][row_index]; + dither2 = cquantize->odither[2][row_index]; + col_index = 0; + + for (col = width; col > 0; col--) { + pixcode = GETJSAMPLE(colorindex0[GETJSAMPLE(*input_ptr++) + + dither0[col_index]]); + pixcode += GETJSAMPLE(colorindex1[GETJSAMPLE(*input_ptr++) + + dither1[col_index]]); + pixcode += GETJSAMPLE(colorindex2[GETJSAMPLE(*input_ptr++) + + dither2[col_index]]); + *output_ptr++ = (JSAMPLE) pixcode; + col_index = (col_index + 1) & ODITHER_MASK; + } + row_index = (row_index + 1) & ODITHER_MASK; + cquantize->row_index = row_index; + } +} + + +METHODDEF(void) +quantize_fs_dither (j_decompress_ptr cinfo, JSAMPARRAY input_buf, + JSAMPARRAY output_buf, int num_rows) +/* General case, with Floyd-Steinberg dithering */ +{ + my_cquantize_ptr cquantize = (my_cquantize_ptr) cinfo->cquantize; + register LOCFSERROR cur; /* current error or pixel value */ + LOCFSERROR belowerr; /* error for pixel below cur */ + LOCFSERROR bpreverr; /* error for below/prev col */ + LOCFSERROR bnexterr; /* error for below/next col */ + LOCFSERROR delta; + register FSERRPTR errorptr; /* => fserrors[] at column before current */ + register JSAMPROW input_ptr; + register JSAMPROW output_ptr; + JSAMPROW colorindex_ci; + JSAMPROW colormap_ci; + int pixcode; + int nc = cinfo->out_color_components; + int dir; /* 1 for left-to-right, -1 for right-to-left */ + int dirnc; /* dir * nc */ + int ci; + int row; + JDIMENSION col; + JDIMENSION width = cinfo->output_width; + JSAMPLE *range_limit = cinfo->sample_range_limit; + SHIFT_TEMPS + + for (row = 0; row < num_rows; row++) { + /* Initialize output values to 0 so can process components separately */ + jzero_far((void *) output_buf[row], (size_t) (width * sizeof(JSAMPLE))); + for (ci = 0; ci < nc; ci++) { + input_ptr = input_buf[row] + ci; + output_ptr = output_buf[row]; + if (cquantize->on_odd_row) { + /* work right to left in this row */ + input_ptr += (width-1) * nc; /* so point to rightmost pixel */ + output_ptr += width-1; + dir = -1; + dirnc = -nc; + errorptr = cquantize->fserrors[ci] + (width+1); /* => entry after last column */ + } else { + /* work left to right in this row */ + dir = 1; + dirnc = nc; + errorptr = cquantize->fserrors[ci]; /* => entry before first column */ + } + colorindex_ci = cquantize->colorindex[ci]; + colormap_ci = cquantize->sv_colormap[ci]; + /* Preset error values: no error propagated to first pixel from left */ + cur = 0; + /* and no error propagated to row below yet */ + belowerr = bpreverr = 0; + + for (col = width; col > 0; col--) { + /* cur holds the error propagated from the previous pixel on the + * current line. Add the error propagated from the previous line + * to form the complete error correction term for this pixel, and + * round the error term (which is expressed * 16) to an integer. + * RIGHT_SHIFT rounds towards minus infinity, so adding 8 is correct + * for either sign of the error value. + * Note: errorptr points to *previous* column's array entry. + */ + cur = RIGHT_SHIFT(cur + errorptr[dir] + 8, 4); + /* Form pixel value + error, and range-limit to 0..MAXJSAMPLE. + * The maximum error is +- MAXJSAMPLE; this sets the required size + * of the range_limit array. + */ + cur += GETJSAMPLE(*input_ptr); + cur = GETJSAMPLE(range_limit[cur]); + /* Select output value, accumulate into output code for this pixel */ + pixcode = GETJSAMPLE(colorindex_ci[cur]); + *output_ptr += (JSAMPLE) pixcode; + /* Compute actual representation error at this pixel */ + /* Note: we can do this even though we don't have the final */ + /* pixel code, because the colormap is orthogonal. */ + cur -= GETJSAMPLE(colormap_ci[pixcode]); + /* Compute error fractions to be propagated to adjacent pixels. + * Add these into the running sums, and simultaneously shift the + * next-line error sums left by 1 column. + */ + bnexterr = cur; + delta = cur * 2; + cur += delta; /* form error * 3 */ + errorptr[0] = (FSERROR) (bpreverr + cur); + cur += delta; /* form error * 5 */ + bpreverr = belowerr + cur; + belowerr = bnexterr; + cur += delta; /* form error * 7 */ + /* At this point cur contains the 7/16 error value to be propagated + * to the next pixel on the current line, and all the errors for the + * next line have been shifted over. We are therefore ready to move on. + */ + input_ptr += dirnc; /* advance input ptr to next column */ + output_ptr += dir; /* advance output ptr to next column */ + errorptr += dir; /* advance errorptr to current column */ + } + /* Post-loop cleanup: we must unload the final error value into the + * final fserrors[] entry. Note we need not unload belowerr because + * it is for the dummy column before or after the actual array. + */ + errorptr[0] = (FSERROR) bpreverr; /* unload prev err into array */ + } + cquantize->on_odd_row = (cquantize->on_odd_row ? FALSE : TRUE); + } +} + + +/* + * Allocate workspace for Floyd-Steinberg errors. + */ + +LOCAL(void) +alloc_fs_workspace (j_decompress_ptr cinfo) +{ + my_cquantize_ptr cquantize = (my_cquantize_ptr) cinfo->cquantize; + size_t arraysize; + int i; + + arraysize = (size_t) ((cinfo->output_width + 2) * sizeof(FSERROR)); + for (i = 0; i < cinfo->out_color_components; i++) { + cquantize->fserrors[i] = (FSERRPTR) + (*cinfo->mem->alloc_large)((j_common_ptr) cinfo, JPOOL_IMAGE, arraysize); + } +} + + +/* + * Initialize for one-pass color quantization. + */ + +METHODDEF(void) +start_pass_1_quant (j_decompress_ptr cinfo, boolean is_pre_scan) +{ + my_cquantize_ptr cquantize = (my_cquantize_ptr) cinfo->cquantize; + size_t arraysize; + int i; + + /* Install my colormap. */ + cinfo->colormap = cquantize->sv_colormap; + cinfo->actual_number_of_colors = cquantize->sv_actual; + + /* Initialize for desired dithering mode. */ + switch (cinfo->dither_mode) { + case JDITHER_NONE: + if (cinfo->out_color_components == 3) + cquantize->pub.color_quantize = color_quantize3; + else + cquantize->pub.color_quantize = color_quantize; + break; + case JDITHER_ORDERED: + if (cinfo->out_color_components == 3) + cquantize->pub.color_quantize = quantize3_ord_dither; + else + cquantize->pub.color_quantize = quantize_ord_dither; + cquantize->row_index = 0; /* initialize state for ordered dither */ + /* If user changed to ordered dither from another mode, + * we must recreate the color index table with padding. + * This will cost extra space, but probably isn't very likely. + */ + if (! cquantize->is_padded) + create_colorindex(cinfo); + /* Create ordered-dither tables if we didn't already. */ + if (cquantize->odither[0] == NULL) + create_odither_tables(cinfo); + break; + case JDITHER_FS: + cquantize->pub.color_quantize = quantize_fs_dither; + cquantize->on_odd_row = FALSE; /* initialize state for F-S dither */ + /* Allocate Floyd-Steinberg workspace if didn't already. */ + if (cquantize->fserrors[0] == NULL) + alloc_fs_workspace(cinfo); + /* Initialize the propagated errors to zero. */ + arraysize = (size_t) ((cinfo->output_width + 2) * sizeof(FSERROR)); + for (i = 0; i < cinfo->out_color_components; i++) + jzero_far((void *) cquantize->fserrors[i], arraysize); + break; + default: + ERREXIT(cinfo, JERR_NOT_COMPILED); + break; + } +} + + +/* + * Finish up at the end of the pass. + */ + +METHODDEF(void) +finish_pass_1_quant (j_decompress_ptr cinfo) +{ + /* no work in 1-pass case */ +} + + +/* + * Switch to a new external colormap between output passes. + * Shouldn't get to this module! + */ + +METHODDEF(void) +new_color_map_1_quant (j_decompress_ptr cinfo) +{ + ERREXIT(cinfo, JERR_MODE_CHANGE); +} + + +/* + * Module initialization routine for 1-pass color quantization. + */ + +GLOBAL(void) +jinit_1pass_quantizer (j_decompress_ptr cinfo) +{ + my_cquantize_ptr cquantize; + + cquantize = (my_cquantize_ptr) + (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE, + sizeof(my_cquantizer)); + cinfo->cquantize = (struct jpeg_color_quantizer *) cquantize; + cquantize->pub.start_pass = start_pass_1_quant; + cquantize->pub.finish_pass = finish_pass_1_quant; + cquantize->pub.new_color_map = new_color_map_1_quant; + cquantize->fserrors[0] = NULL; /* Flag FS workspace not allocated */ + cquantize->odither[0] = NULL; /* Also flag odither arrays not allocated */ + + /* Make sure my internal arrays won't overflow */ + if (cinfo->out_color_components > MAX_Q_COMPS) + ERREXIT1(cinfo, JERR_QUANT_COMPONENTS, MAX_Q_COMPS); + /* Make sure colormap indexes can be represented by JSAMPLEs */ + if (cinfo->desired_number_of_colors > (MAXJSAMPLE+1)) + ERREXIT1(cinfo, JERR_QUANT_MANY_COLORS, MAXJSAMPLE+1); + + /* Create the colormap and color index table. */ + create_colormap(cinfo); + create_colorindex(cinfo); + + /* Allocate Floyd-Steinberg workspace now if requested. + * We do this now since it may affect the memory manager's space + * calculations. If the user changes to FS dither mode in a later pass, we + * will allocate the space then, and will possibly overrun the + * max_memory_to_use setting. + */ + if (cinfo->dither_mode == JDITHER_FS) + alloc_fs_workspace(cinfo); +} + +#endif /* QUANT_1PASS_SUPPORTED */ diff --git a/media/libjpeg/jquant2.c b/media/libjpeg/jquant2.c new file mode 100644 index 000000000..cfbd0f152 --- /dev/null +++ b/media/libjpeg/jquant2.c @@ -0,0 +1,1282 @@ +/* + * jquant2.c + * + * This file was part of the Independent JPEG Group's software: + * Copyright (C) 1991-1996, Thomas G. Lane. + * libjpeg-turbo Modifications: + * Copyright (C) 2009, 2014-2015, D. R. Commander. + * For conditions of distribution and use, see the accompanying README.ijg + * file. + * + * This file contains 2-pass color quantization (color mapping) routines. + * These routines provide selection of a custom color map for an image, + * followed by mapping of the image to that color map, with optional + * Floyd-Steinberg dithering. + * It is also possible to use just the second pass to map to an arbitrary + * externally-given color map. + * + * Note: ordered dithering is not supported, since there isn't any fast + * way to compute intercolor distances; it's unclear that ordered dither's + * fundamental assumptions even hold with an irregularly spaced color map. + */ + +#define JPEG_INTERNALS +#include "jinclude.h" +#include "jpeglib.h" + +#ifdef QUANT_2PASS_SUPPORTED + + +/* + * This module implements the well-known Heckbert paradigm for color + * quantization. Most of the ideas used here can be traced back to + * Heckbert's seminal paper + * Heckbert, Paul. "Color Image Quantization for Frame Buffer Display", + * Proc. SIGGRAPH '82, Computer Graphics v.16 #3 (July 1982), pp 297-304. + * + * In the first pass over the image, we accumulate a histogram showing the + * usage count of each possible color. To keep the histogram to a reasonable + * size, we reduce the precision of the input; typical practice is to retain + * 5 or 6 bits per color, so that 8 or 4 different input values are counted + * in the same histogram cell. + * + * Next, the color-selection step begins with a box representing the whole + * color space, and repeatedly splits the "largest" remaining box until we + * have as many boxes as desired colors. Then the mean color in each + * remaining box becomes one of the possible output colors. + * + * The second pass over the image maps each input pixel to the closest output + * color (optionally after applying a Floyd-Steinberg dithering correction). + * This mapping is logically trivial, but making it go fast enough requires + * considerable care. + * + * Heckbert-style quantizers vary a good deal in their policies for choosing + * the "largest" box and deciding where to cut it. The particular policies + * used here have proved out well in experimental comparisons, but better ones + * may yet be found. + * + * In earlier versions of the IJG code, this module quantized in YCbCr color + * space, processing the raw upsampled data without a color conversion step. + * This allowed the color conversion math to be done only once per colormap + * entry, not once per pixel. However, that optimization precluded other + * useful optimizations (such as merging color conversion with upsampling) + * and it also interfered with desired capabilities such as quantizing to an + * externally-supplied colormap. We have therefore abandoned that approach. + * The present code works in the post-conversion color space, typically RGB. + * + * To improve the visual quality of the results, we actually work in scaled + * RGB space, giving G distances more weight than R, and R in turn more than + * B. To do everything in integer math, we must use integer scale factors. + * The 2/3/1 scale factors used here correspond loosely to the relative + * weights of the colors in the NTSC grayscale equation. + * If you want to use this code to quantize a non-RGB color space, you'll + * probably need to change these scale factors. + */ + +#define R_SCALE 2 /* scale R distances by this much */ +#define G_SCALE 3 /* scale G distances by this much */ +#define B_SCALE 1 /* and B by this much */ + +static const int c_scales[3]={R_SCALE, G_SCALE, B_SCALE}; +#define C0_SCALE c_scales[rgb_red[cinfo->out_color_space]] +#define C1_SCALE c_scales[rgb_green[cinfo->out_color_space]] +#define C2_SCALE c_scales[rgb_blue[cinfo->out_color_space]] + +/* + * First we have the histogram data structure and routines for creating it. + * + * The number of bits of precision can be adjusted by changing these symbols. + * We recommend keeping 6 bits for G and 5 each for R and B. + * If you have plenty of memory and cycles, 6 bits all around gives marginally + * better results; if you are short of memory, 5 bits all around will save + * some space but degrade the results. + * To maintain a fully accurate histogram, we'd need to allocate a "long" + * (preferably unsigned long) for each cell. In practice this is overkill; + * we can get by with 16 bits per cell. Few of the cell counts will overflow, + * and clamping those that do overflow to the maximum value will give close- + * enough results. This reduces the recommended histogram size from 256Kb + * to 128Kb, which is a useful savings on PC-class machines. + * (In the second pass the histogram space is re-used for pixel mapping data; + * in that capacity, each cell must be able to store zero to the number of + * desired colors. 16 bits/cell is plenty for that too.) + * Since the JPEG code is intended to run in small memory model on 80x86 + * machines, we can't just allocate the histogram in one chunk. Instead + * of a true 3-D array, we use a row of pointers to 2-D arrays. Each + * pointer corresponds to a C0 value (typically 2^5 = 32 pointers) and + * each 2-D array has 2^6*2^5 = 2048 or 2^6*2^6 = 4096 entries. + */ + +#define MAXNUMCOLORS (MAXJSAMPLE+1) /* maximum size of colormap */ + +/* These will do the right thing for either R,G,B or B,G,R color order, + * but you may not like the results for other color orders. + */ +#define HIST_C0_BITS 5 /* bits of precision in R/B histogram */ +#define HIST_C1_BITS 6 /* bits of precision in G histogram */ +#define HIST_C2_BITS 5 /* bits of precision in B/R histogram */ + +/* Number of elements along histogram axes. */ +#define HIST_C0_ELEMS (1<<HIST_C0_BITS) +#define HIST_C1_ELEMS (1<<HIST_C1_BITS) +#define HIST_C2_ELEMS (1<<HIST_C2_BITS) + +/* These are the amounts to shift an input value to get a histogram index. */ +#define C0_SHIFT (BITS_IN_JSAMPLE-HIST_C0_BITS) +#define C1_SHIFT (BITS_IN_JSAMPLE-HIST_C1_BITS) +#define C2_SHIFT (BITS_IN_JSAMPLE-HIST_C2_BITS) + + +typedef UINT16 histcell; /* histogram cell; prefer an unsigned type */ + +typedef histcell *histptr; /* for pointers to histogram cells */ + +typedef histcell hist1d[HIST_C2_ELEMS]; /* typedefs for the array */ +typedef hist1d *hist2d; /* type for the 2nd-level pointers */ +typedef hist2d *hist3d; /* type for top-level pointer */ + + +/* Declarations for Floyd-Steinberg dithering. + * + * Errors are accumulated into the array fserrors[], at a resolution of + * 1/16th of a pixel count. The error at a given pixel is propagated + * to its not-yet-processed neighbors using the standard F-S fractions, + * ... (here) 7/16 + * 3/16 5/16 1/16 + * We work left-to-right on even rows, right-to-left on odd rows. + * + * We can get away with a single array (holding one row's worth of errors) + * by using it to store the current row's errors at pixel columns not yet + * processed, but the next row's errors at columns already processed. We + * need only a few extra variables to hold the errors immediately around the + * current column. (If we are lucky, those variables are in registers, but + * even if not, they're probably cheaper to access than array elements are.) + * + * The fserrors[] array has (#columns + 2) entries; the extra entry at + * each end saves us from special-casing the first and last pixels. + * Each entry is three values long, one value for each color component. + */ + +#if BITS_IN_JSAMPLE == 8 +typedef INT16 FSERROR; /* 16 bits should be enough */ +typedef int LOCFSERROR; /* use 'int' for calculation temps */ +#else +typedef JLONG FSERROR; /* may need more than 16 bits */ +typedef JLONG LOCFSERROR; /* be sure calculation temps are big enough */ +#endif + +typedef FSERROR *FSERRPTR; /* pointer to error array */ + + +/* Private subobject */ + +typedef struct { + struct jpeg_color_quantizer pub; /* public fields */ + + /* Space for the eventually created colormap is stashed here */ + JSAMPARRAY sv_colormap; /* colormap allocated at init time */ + int desired; /* desired # of colors = size of colormap */ + + /* Variables for accumulating image statistics */ + hist3d histogram; /* pointer to the histogram */ + + boolean needs_zeroed; /* TRUE if next pass must zero histogram */ + + /* Variables for Floyd-Steinberg dithering */ + FSERRPTR fserrors; /* accumulated errors */ + boolean on_odd_row; /* flag to remember which row we are on */ + int *error_limiter; /* table for clamping the applied error */ +} my_cquantizer; + +typedef my_cquantizer *my_cquantize_ptr; + + +/* + * Prescan some rows of pixels. + * In this module the prescan simply updates the histogram, which has been + * initialized to zeroes by start_pass. + * An output_buf parameter is required by the method signature, but no data + * is actually output (in fact the buffer controller is probably passing a + * NULL pointer). + */ + +METHODDEF(void) +prescan_quantize (j_decompress_ptr cinfo, JSAMPARRAY input_buf, + JSAMPARRAY output_buf, int num_rows) +{ + my_cquantize_ptr cquantize = (my_cquantize_ptr) cinfo->cquantize; + register JSAMPROW ptr; + register histptr histp; + register hist3d histogram = cquantize->histogram; + int row; + JDIMENSION col; + JDIMENSION width = cinfo->output_width; + + for (row = 0; row < num_rows; row++) { + ptr = input_buf[row]; + for (col = width; col > 0; col--) { + /* get pixel value and index into the histogram */ + histp = & histogram[GETJSAMPLE(ptr[0]) >> C0_SHIFT] + [GETJSAMPLE(ptr[1]) >> C1_SHIFT] + [GETJSAMPLE(ptr[2]) >> C2_SHIFT]; + /* increment, check for overflow and undo increment if so. */ + if (++(*histp) <= 0) + (*histp)--; + ptr += 3; + } + } +} + + +/* + * Next we have the really interesting routines: selection of a colormap + * given the completed histogram. + * These routines work with a list of "boxes", each representing a rectangular + * subset of the input color space (to histogram precision). + */ + +typedef struct { + /* The bounds of the box (inclusive); expressed as histogram indexes */ + int c0min, c0max; + int c1min, c1max; + int c2min, c2max; + /* The volume (actually 2-norm) of the box */ + JLONG volume; + /* The number of nonzero histogram cells within this box */ + long colorcount; +} box; + +typedef box *boxptr; + + +LOCAL(boxptr) +find_biggest_color_pop (boxptr boxlist, int numboxes) +/* Find the splittable box with the largest color population */ +/* Returns NULL if no splittable boxes remain */ +{ + register boxptr boxp; + register int i; + register long maxc = 0; + boxptr which = NULL; + + for (i = 0, boxp = boxlist; i < numboxes; i++, boxp++) { + if (boxp->colorcount > maxc && boxp->volume > 0) { + which = boxp; + maxc = boxp->colorcount; + } + } + return which; +} + + +LOCAL(boxptr) +find_biggest_volume (boxptr boxlist, int numboxes) +/* Find the splittable box with the largest (scaled) volume */ +/* Returns NULL if no splittable boxes remain */ +{ + register boxptr boxp; + register int i; + register JLONG maxv = 0; + boxptr which = NULL; + + for (i = 0, boxp = boxlist; i < numboxes; i++, boxp++) { + if (boxp->volume > maxv) { + which = boxp; + maxv = boxp->volume; + } + } + return which; +} + + +LOCAL(void) +update_box (j_decompress_ptr cinfo, boxptr boxp) +/* Shrink the min/max bounds of a box to enclose only nonzero elements, */ +/* and recompute its volume and population */ +{ + my_cquantize_ptr cquantize = (my_cquantize_ptr) cinfo->cquantize; + hist3d histogram = cquantize->histogram; + histptr histp; + int c0,c1,c2; + int c0min,c0max,c1min,c1max,c2min,c2max; + JLONG dist0,dist1,dist2; + long ccount; + + c0min = boxp->c0min; c0max = boxp->c0max; + c1min = boxp->c1min; c1max = boxp->c1max; + c2min = boxp->c2min; c2max = boxp->c2max; + + if (c0max > c0min) + for (c0 = c0min; c0 <= c0max; c0++) + for (c1 = c1min; c1 <= c1max; c1++) { + histp = & histogram[c0][c1][c2min]; + for (c2 = c2min; c2 <= c2max; c2++) + if (*histp++ != 0) { + boxp->c0min = c0min = c0; + goto have_c0min; + } + } + have_c0min: + if (c0max > c0min) + for (c0 = c0max; c0 >= c0min; c0--) + for (c1 = c1min; c1 <= c1max; c1++) { + histp = & histogram[c0][c1][c2min]; + for (c2 = c2min; c2 <= c2max; c2++) + if (*histp++ != 0) { + boxp->c0max = c0max = c0; + goto have_c0max; + } + } + have_c0max: + if (c1max > c1min) + for (c1 = c1min; c1 <= c1max; c1++) + for (c0 = c0min; c0 <= c0max; c0++) { + histp = & histogram[c0][c1][c2min]; + for (c2 = c2min; c2 <= c2max; c2++) + if (*histp++ != 0) { + boxp->c1min = c1min = c1; + goto have_c1min; + } + } + have_c1min: + if (c1max > c1min) + for (c1 = c1max; c1 >= c1min; c1--) + for (c0 = c0min; c0 <= c0max; c0++) { + histp = & histogram[c0][c1][c2min]; + for (c2 = c2min; c2 <= c2max; c2++) + if (*histp++ != 0) { + boxp->c1max = c1max = c1; + goto have_c1max; + } + } + have_c1max: + if (c2max > c2min) + for (c2 = c2min; c2 <= c2max; c2++) + for (c0 = c0min; c0 <= c0max; c0++) { + histp = & histogram[c0][c1min][c2]; + for (c1 = c1min; c1 <= c1max; c1++, histp += HIST_C2_ELEMS) + if (*histp != 0) { + boxp->c2min = c2min = c2; + goto have_c2min; + } + } + have_c2min: + if (c2max > c2min) + for (c2 = c2max; c2 >= c2min; c2--) + for (c0 = c0min; c0 <= c0max; c0++) { + histp = & histogram[c0][c1min][c2]; + for (c1 = c1min; c1 <= c1max; c1++, histp += HIST_C2_ELEMS) + if (*histp != 0) { + boxp->c2max = c2max = c2; + goto have_c2max; + } + } + have_c2max: + + /* Update box volume. + * We use 2-norm rather than real volume here; this biases the method + * against making long narrow boxes, and it has the side benefit that + * a box is splittable iff norm > 0. + * Since the differences are expressed in histogram-cell units, + * we have to shift back to JSAMPLE units to get consistent distances; + * after which, we scale according to the selected distance scale factors. + */ + dist0 = ((c0max - c0min) << C0_SHIFT) * C0_SCALE; + dist1 = ((c1max - c1min) << C1_SHIFT) * C1_SCALE; + dist2 = ((c2max - c2min) << C2_SHIFT) * C2_SCALE; + boxp->volume = dist0*dist0 + dist1*dist1 + dist2*dist2; + + /* Now scan remaining volume of box and compute population */ + ccount = 0; + for (c0 = c0min; c0 <= c0max; c0++) + for (c1 = c1min; c1 <= c1max; c1++) { + histp = & histogram[c0][c1][c2min]; + for (c2 = c2min; c2 <= c2max; c2++, histp++) + if (*histp != 0) { + ccount++; + } + } + boxp->colorcount = ccount; +} + + +LOCAL(int) +median_cut (j_decompress_ptr cinfo, boxptr boxlist, int numboxes, + int desired_colors) +/* Repeatedly select and split the largest box until we have enough boxes */ +{ + int n,lb; + int c0,c1,c2,cmax; + register boxptr b1,b2; + + while (numboxes < desired_colors) { + /* Select box to split. + * Current algorithm: by population for first half, then by volume. + */ + if (numboxes*2 <= desired_colors) { + b1 = find_biggest_color_pop(boxlist, numboxes); + } else { + b1 = find_biggest_volume(boxlist, numboxes); + } + if (b1 == NULL) /* no splittable boxes left! */ + break; + b2 = &boxlist[numboxes]; /* where new box will go */ + /* Copy the color bounds to the new box. */ + b2->c0max = b1->c0max; b2->c1max = b1->c1max; b2->c2max = b1->c2max; + b2->c0min = b1->c0min; b2->c1min = b1->c1min; b2->c2min = b1->c2min; + /* Choose which axis to split the box on. + * Current algorithm: longest scaled axis. + * See notes in update_box about scaling distances. + */ + c0 = ((b1->c0max - b1->c0min) << C0_SHIFT) * C0_SCALE; + c1 = ((b1->c1max - b1->c1min) << C1_SHIFT) * C1_SCALE; + c2 = ((b1->c2max - b1->c2min) << C2_SHIFT) * C2_SCALE; + /* We want to break any ties in favor of green, then red, blue last. + * This code does the right thing for R,G,B or B,G,R color orders only. + */ + if (rgb_red[cinfo->out_color_space] == 0) { + cmax = c1; n = 1; + if (c0 > cmax) { cmax = c0; n = 0; } + if (c2 > cmax) { n = 2; } + } + else { + cmax = c1; n = 1; + if (c2 > cmax) { cmax = c2; n = 2; } + if (c0 > cmax) { n = 0; } + } + /* Choose split point along selected axis, and update box bounds. + * Current algorithm: split at halfway point. + * (Since the box has been shrunk to minimum volume, + * any split will produce two nonempty subboxes.) + * Note that lb value is max for lower box, so must be < old max. + */ + switch (n) { + case 0: + lb = (b1->c0max + b1->c0min) / 2; + b1->c0max = lb; + b2->c0min = lb+1; + break; + case 1: + lb = (b1->c1max + b1->c1min) / 2; + b1->c1max = lb; + b2->c1min = lb+1; + break; + case 2: + lb = (b1->c2max + b1->c2min) / 2; + b1->c2max = lb; + b2->c2min = lb+1; + break; + } + /* Update stats for boxes */ + update_box(cinfo, b1); + update_box(cinfo, b2); + numboxes++; + } + return numboxes; +} + + +LOCAL(void) +compute_color (j_decompress_ptr cinfo, boxptr boxp, int icolor) +/* Compute representative color for a box, put it in colormap[icolor] */ +{ + /* Current algorithm: mean weighted by pixels (not colors) */ + /* Note it is important to get the rounding correct! */ + my_cquantize_ptr cquantize = (my_cquantize_ptr) cinfo->cquantize; + hist3d histogram = cquantize->histogram; + histptr histp; + int c0,c1,c2; + int c0min,c0max,c1min,c1max,c2min,c2max; + long count; + long total = 0; + long c0total = 0; + long c1total = 0; + long c2total = 0; + + c0min = boxp->c0min; c0max = boxp->c0max; + c1min = boxp->c1min; c1max = boxp->c1max; + c2min = boxp->c2min; c2max = boxp->c2max; + + for (c0 = c0min; c0 <= c0max; c0++) + for (c1 = c1min; c1 <= c1max; c1++) { + histp = & histogram[c0][c1][c2min]; + for (c2 = c2min; c2 <= c2max; c2++) { + if ((count = *histp++) != 0) { + total += count; + c0total += ((c0 << C0_SHIFT) + ((1<<C0_SHIFT)>>1)) * count; + c1total += ((c1 << C1_SHIFT) + ((1<<C1_SHIFT)>>1)) * count; + c2total += ((c2 << C2_SHIFT) + ((1<<C2_SHIFT)>>1)) * count; + } + } + } + + cinfo->colormap[0][icolor] = (JSAMPLE) ((c0total + (total>>1)) / total); + cinfo->colormap[1][icolor] = (JSAMPLE) ((c1total + (total>>1)) / total); + cinfo->colormap[2][icolor] = (JSAMPLE) ((c2total + (total>>1)) / total); +} + + +LOCAL(void) +select_colors (j_decompress_ptr cinfo, int desired_colors) +/* Master routine for color selection */ +{ + boxptr boxlist; + int numboxes; + int i; + + /* Allocate workspace for box list */ + boxlist = (boxptr) (*cinfo->mem->alloc_small) + ((j_common_ptr) cinfo, JPOOL_IMAGE, desired_colors * sizeof(box)); + /* Initialize one box containing whole space */ + numboxes = 1; + boxlist[0].c0min = 0; + boxlist[0].c0max = MAXJSAMPLE >> C0_SHIFT; + boxlist[0].c1min = 0; + boxlist[0].c1max = MAXJSAMPLE >> C1_SHIFT; + boxlist[0].c2min = 0; + boxlist[0].c2max = MAXJSAMPLE >> C2_SHIFT; + /* Shrink it to actually-used volume and set its statistics */ + update_box(cinfo, & boxlist[0]); + /* Perform median-cut to produce final box list */ + numboxes = median_cut(cinfo, boxlist, numboxes, desired_colors); + /* Compute the representative color for each box, fill colormap */ + for (i = 0; i < numboxes; i++) + compute_color(cinfo, & boxlist[i], i); + cinfo->actual_number_of_colors = numboxes; + TRACEMS1(cinfo, 1, JTRC_QUANT_SELECTED, numboxes); +} + + +/* + * These routines are concerned with the time-critical task of mapping input + * colors to the nearest color in the selected colormap. + * + * We re-use the histogram space as an "inverse color map", essentially a + * cache for the results of nearest-color searches. All colors within a + * histogram cell will be mapped to the same colormap entry, namely the one + * closest to the cell's center. This may not be quite the closest entry to + * the actual input color, but it's almost as good. A zero in the cache + * indicates we haven't found the nearest color for that cell yet; the array + * is cleared to zeroes before starting the mapping pass. When we find the + * nearest color for a cell, its colormap index plus one is recorded in the + * cache for future use. The pass2 scanning routines call fill_inverse_cmap + * when they need to use an unfilled entry in the cache. + * + * Our method of efficiently finding nearest colors is based on the "locally + * sorted search" idea described by Heckbert and on the incremental distance + * calculation described by Spencer W. Thomas in chapter III.1 of Graphics + * Gems II (James Arvo, ed. Academic Press, 1991). Thomas points out that + * the distances from a given colormap entry to each cell of the histogram can + * be computed quickly using an incremental method: the differences between + * distances to adjacent cells themselves differ by a constant. This allows a + * fairly fast implementation of the "brute force" approach of computing the + * distance from every colormap entry to every histogram cell. Unfortunately, + * it needs a work array to hold the best-distance-so-far for each histogram + * cell (because the inner loop has to be over cells, not colormap entries). + * The work array elements have to be JLONGs, so the work array would need + * 256Kb at our recommended precision. This is not feasible in DOS machines. + * + * To get around these problems, we apply Thomas' method to compute the + * nearest colors for only the cells within a small subbox of the histogram. + * The work array need be only as big as the subbox, so the memory usage + * problem is solved. Furthermore, we need not fill subboxes that are never + * referenced in pass2; many images use only part of the color gamut, so a + * fair amount of work is saved. An additional advantage of this + * approach is that we can apply Heckbert's locality criterion to quickly + * eliminate colormap entries that are far away from the subbox; typically + * three-fourths of the colormap entries are rejected by Heckbert's criterion, + * and we need not compute their distances to individual cells in the subbox. + * The speed of this approach is heavily influenced by the subbox size: too + * small means too much overhead, too big loses because Heckbert's criterion + * can't eliminate as many colormap entries. Empirically the best subbox + * size seems to be about 1/512th of the histogram (1/8th in each direction). + * + * Thomas' article also describes a refined method which is asymptotically + * faster than the brute-force method, but it is also far more complex and + * cannot efficiently be applied to small subboxes. It is therefore not + * useful for programs intended to be portable to DOS machines. On machines + * with plenty of memory, filling the whole histogram in one shot with Thomas' + * refined method might be faster than the present code --- but then again, + * it might not be any faster, and it's certainly more complicated. + */ + + +/* log2(histogram cells in update box) for each axis; this can be adjusted */ +#define BOX_C0_LOG (HIST_C0_BITS-3) +#define BOX_C1_LOG (HIST_C1_BITS-3) +#define BOX_C2_LOG (HIST_C2_BITS-3) + +#define BOX_C0_ELEMS (1<<BOX_C0_LOG) /* # of hist cells in update box */ +#define BOX_C1_ELEMS (1<<BOX_C1_LOG) +#define BOX_C2_ELEMS (1<<BOX_C2_LOG) + +#define BOX_C0_SHIFT (C0_SHIFT + BOX_C0_LOG) +#define BOX_C1_SHIFT (C1_SHIFT + BOX_C1_LOG) +#define BOX_C2_SHIFT (C2_SHIFT + BOX_C2_LOG) + + +/* + * The next three routines implement inverse colormap filling. They could + * all be folded into one big routine, but splitting them up this way saves + * some stack space (the mindist[] and bestdist[] arrays need not coexist) + * and may allow some compilers to produce better code by registerizing more + * inner-loop variables. + */ + +LOCAL(int) +find_nearby_colors (j_decompress_ptr cinfo, int minc0, int minc1, int minc2, + JSAMPLE colorlist[]) +/* Locate the colormap entries close enough to an update box to be candidates + * for the nearest entry to some cell(s) in the update box. The update box + * is specified by the center coordinates of its first cell. The number of + * candidate colormap entries is returned, and their colormap indexes are + * placed in colorlist[]. + * This routine uses Heckbert's "locally sorted search" criterion to select + * the colors that need further consideration. + */ +{ + int numcolors = cinfo->actual_number_of_colors; + int maxc0, maxc1, maxc2; + int centerc0, centerc1, centerc2; + int i, x, ncolors; + JLONG minmaxdist, min_dist, max_dist, tdist; + JLONG mindist[MAXNUMCOLORS]; /* min distance to colormap entry i */ + + /* Compute true coordinates of update box's upper corner and center. + * Actually we compute the coordinates of the center of the upper-corner + * histogram cell, which are the upper bounds of the volume we care about. + * Note that since ">>" rounds down, the "center" values may be closer to + * min than to max; hence comparisons to them must be "<=", not "<". + */ + maxc0 = minc0 + ((1 << BOX_C0_SHIFT) - (1 << C0_SHIFT)); + centerc0 = (minc0 + maxc0) >> 1; + maxc1 = minc1 + ((1 << BOX_C1_SHIFT) - (1 << C1_SHIFT)); + centerc1 = (minc1 + maxc1) >> 1; + maxc2 = minc2 + ((1 << BOX_C2_SHIFT) - (1 << C2_SHIFT)); + centerc2 = (minc2 + maxc2) >> 1; + + /* For each color in colormap, find: + * 1. its minimum squared-distance to any point in the update box + * (zero if color is within update box); + * 2. its maximum squared-distance to any point in the update box. + * Both of these can be found by considering only the corners of the box. + * We save the minimum distance for each color in mindist[]; + * only the smallest maximum distance is of interest. + */ + minmaxdist = 0x7FFFFFFFL; + + for (i = 0; i < numcolors; i++) { + /* We compute the squared-c0-distance term, then add in the other two. */ + x = GETJSAMPLE(cinfo->colormap[0][i]); + if (x < minc0) { + tdist = (x - minc0) * C0_SCALE; + min_dist = tdist*tdist; + tdist = (x - maxc0) * C0_SCALE; + max_dist = tdist*tdist; + } else if (x > maxc0) { + tdist = (x - maxc0) * C0_SCALE; + min_dist = tdist*tdist; + tdist = (x - minc0) * C0_SCALE; + max_dist = tdist*tdist; + } else { + /* within cell range so no contribution to min_dist */ + min_dist = 0; + if (x <= centerc0) { + tdist = (x - maxc0) * C0_SCALE; + max_dist = tdist*tdist; + } else { + tdist = (x - minc0) * C0_SCALE; + max_dist = tdist*tdist; + } + } + + x = GETJSAMPLE(cinfo->colormap[1][i]); + if (x < minc1) { + tdist = (x - minc1) * C1_SCALE; + min_dist += tdist*tdist; + tdist = (x - maxc1) * C1_SCALE; + max_dist += tdist*tdist; + } else if (x > maxc1) { + tdist = (x - maxc1) * C1_SCALE; + min_dist += tdist*tdist; + tdist = (x - minc1) * C1_SCALE; + max_dist += tdist*tdist; + } else { + /* within cell range so no contribution to min_dist */ + if (x <= centerc1) { + tdist = (x - maxc1) * C1_SCALE; + max_dist += tdist*tdist; + } else { + tdist = (x - minc1) * C1_SCALE; + max_dist += tdist*tdist; + } + } + + x = GETJSAMPLE(cinfo->colormap[2][i]); + if (x < minc2) { + tdist = (x - minc2) * C2_SCALE; + min_dist += tdist*tdist; + tdist = (x - maxc2) * C2_SCALE; + max_dist += tdist*tdist; + } else if (x > maxc2) { + tdist = (x - maxc2) * C2_SCALE; + min_dist += tdist*tdist; + tdist = (x - minc2) * C2_SCALE; + max_dist += tdist*tdist; + } else { + /* within cell range so no contribution to min_dist */ + if (x <= centerc2) { + tdist = (x - maxc2) * C2_SCALE; + max_dist += tdist*tdist; + } else { + tdist = (x - minc2) * C2_SCALE; + max_dist += tdist*tdist; + } + } + + mindist[i] = min_dist; /* save away the results */ + if (max_dist < minmaxdist) + minmaxdist = max_dist; + } + + /* Now we know that no cell in the update box is more than minmaxdist + * away from some colormap entry. Therefore, only colors that are + * within minmaxdist of some part of the box need be considered. + */ + ncolors = 0; + for (i = 0; i < numcolors; i++) { + if (mindist[i] <= minmaxdist) + colorlist[ncolors++] = (JSAMPLE) i; + } + return ncolors; +} + + +LOCAL(void) +find_best_colors (j_decompress_ptr cinfo, int minc0, int minc1, int minc2, + int numcolors, JSAMPLE colorlist[], JSAMPLE bestcolor[]) +/* Find the closest colormap entry for each cell in the update box, + * given the list of candidate colors prepared by find_nearby_colors. + * Return the indexes of the closest entries in the bestcolor[] array. + * This routine uses Thomas' incremental distance calculation method to + * find the distance from a colormap entry to successive cells in the box. + */ +{ + int ic0, ic1, ic2; + int i, icolor; + register JLONG *bptr; /* pointer into bestdist[] array */ + JSAMPLE *cptr; /* pointer into bestcolor[] array */ + JLONG dist0, dist1; /* initial distance values */ + register JLONG dist2; /* current distance in inner loop */ + JLONG xx0, xx1; /* distance increments */ + register JLONG xx2; + JLONG inc0, inc1, inc2; /* initial values for increments */ + /* This array holds the distance to the nearest-so-far color for each cell */ + JLONG bestdist[BOX_C0_ELEMS * BOX_C1_ELEMS * BOX_C2_ELEMS]; + + /* Initialize best-distance for each cell of the update box */ + bptr = bestdist; + for (i = BOX_C0_ELEMS*BOX_C1_ELEMS*BOX_C2_ELEMS-1; i >= 0; i--) + *bptr++ = 0x7FFFFFFFL; + + /* For each color selected by find_nearby_colors, + * compute its distance to the center of each cell in the box. + * If that's less than best-so-far, update best distance and color number. + */ + + /* Nominal steps between cell centers ("x" in Thomas article) */ +#define STEP_C0 ((1 << C0_SHIFT) * C0_SCALE) +#define STEP_C1 ((1 << C1_SHIFT) * C1_SCALE) +#define STEP_C2 ((1 << C2_SHIFT) * C2_SCALE) + + for (i = 0; i < numcolors; i++) { + icolor = GETJSAMPLE(colorlist[i]); + /* Compute (square of) distance from minc0/c1/c2 to this color */ + inc0 = (minc0 - GETJSAMPLE(cinfo->colormap[0][icolor])) * C0_SCALE; + dist0 = inc0*inc0; + inc1 = (minc1 - GETJSAMPLE(cinfo->colormap[1][icolor])) * C1_SCALE; + dist0 += inc1*inc1; + inc2 = (minc2 - GETJSAMPLE(cinfo->colormap[2][icolor])) * C2_SCALE; + dist0 += inc2*inc2; + /* Form the initial difference increments */ + inc0 = inc0 * (2 * STEP_C0) + STEP_C0 * STEP_C0; + inc1 = inc1 * (2 * STEP_C1) + STEP_C1 * STEP_C1; + inc2 = inc2 * (2 * STEP_C2) + STEP_C2 * STEP_C2; + /* Now loop over all cells in box, updating distance per Thomas method */ + bptr = bestdist; + cptr = bestcolor; + xx0 = inc0; + for (ic0 = BOX_C0_ELEMS-1; ic0 >= 0; ic0--) { + dist1 = dist0; + xx1 = inc1; + for (ic1 = BOX_C1_ELEMS-1; ic1 >= 0; ic1--) { + dist2 = dist1; + xx2 = inc2; + for (ic2 = BOX_C2_ELEMS-1; ic2 >= 0; ic2--) { + if (dist2 < *bptr) { + *bptr = dist2; + *cptr = (JSAMPLE) icolor; + } + dist2 += xx2; + xx2 += 2 * STEP_C2 * STEP_C2; + bptr++; + cptr++; + } + dist1 += xx1; + xx1 += 2 * STEP_C1 * STEP_C1; + } + dist0 += xx0; + xx0 += 2 * STEP_C0 * STEP_C0; + } + } +} + + +LOCAL(void) +fill_inverse_cmap (j_decompress_ptr cinfo, int c0, int c1, int c2) +/* Fill the inverse-colormap entries in the update box that contains */ +/* histogram cell c0/c1/c2. (Only that one cell MUST be filled, but */ +/* we can fill as many others as we wish.) */ +{ + my_cquantize_ptr cquantize = (my_cquantize_ptr) cinfo->cquantize; + hist3d histogram = cquantize->histogram; + int minc0, minc1, minc2; /* lower left corner of update box */ + int ic0, ic1, ic2; + register JSAMPLE *cptr; /* pointer into bestcolor[] array */ + register histptr cachep; /* pointer into main cache array */ + /* This array lists the candidate colormap indexes. */ + JSAMPLE colorlist[MAXNUMCOLORS]; + int numcolors; /* number of candidate colors */ + /* This array holds the actually closest colormap index for each cell. */ + JSAMPLE bestcolor[BOX_C0_ELEMS * BOX_C1_ELEMS * BOX_C2_ELEMS]; + + /* Convert cell coordinates to update box ID */ + c0 >>= BOX_C0_LOG; + c1 >>= BOX_C1_LOG; + c2 >>= BOX_C2_LOG; + + /* Compute true coordinates of update box's origin corner. + * Actually we compute the coordinates of the center of the corner + * histogram cell, which are the lower bounds of the volume we care about. + */ + minc0 = (c0 << BOX_C0_SHIFT) + ((1 << C0_SHIFT) >> 1); + minc1 = (c1 << BOX_C1_SHIFT) + ((1 << C1_SHIFT) >> 1); + minc2 = (c2 << BOX_C2_SHIFT) + ((1 << C2_SHIFT) >> 1); + + /* Determine which colormap entries are close enough to be candidates + * for the nearest entry to some cell in the update box. + */ + numcolors = find_nearby_colors(cinfo, minc0, minc1, minc2, colorlist); + + /* Determine the actually nearest colors. */ + find_best_colors(cinfo, minc0, minc1, minc2, numcolors, colorlist, + bestcolor); + + /* Save the best color numbers (plus 1) in the main cache array */ + c0 <<= BOX_C0_LOG; /* convert ID back to base cell indexes */ + c1 <<= BOX_C1_LOG; + c2 <<= BOX_C2_LOG; + cptr = bestcolor; + for (ic0 = 0; ic0 < BOX_C0_ELEMS; ic0++) { + for (ic1 = 0; ic1 < BOX_C1_ELEMS; ic1++) { + cachep = & histogram[c0+ic0][c1+ic1][c2]; + for (ic2 = 0; ic2 < BOX_C2_ELEMS; ic2++) { + *cachep++ = (histcell) (GETJSAMPLE(*cptr++) + 1); + } + } + } +} + + +/* + * Map some rows of pixels to the output colormapped representation. + */ + +METHODDEF(void) +pass2_no_dither (j_decompress_ptr cinfo, + JSAMPARRAY input_buf, JSAMPARRAY output_buf, int num_rows) +/* This version performs no dithering */ +{ + my_cquantize_ptr cquantize = (my_cquantize_ptr) cinfo->cquantize; + hist3d histogram = cquantize->histogram; + register JSAMPROW inptr, outptr; + register histptr cachep; + register int c0, c1, c2; + int row; + JDIMENSION col; + JDIMENSION width = cinfo->output_width; + + for (row = 0; row < num_rows; row++) { + inptr = input_buf[row]; + outptr = output_buf[row]; + for (col = width; col > 0; col--) { + /* get pixel value and index into the cache */ + c0 = GETJSAMPLE(*inptr++) >> C0_SHIFT; + c1 = GETJSAMPLE(*inptr++) >> C1_SHIFT; + c2 = GETJSAMPLE(*inptr++) >> C2_SHIFT; + cachep = & histogram[c0][c1][c2]; + /* If we have not seen this color before, find nearest colormap entry */ + /* and update the cache */ + if (*cachep == 0) + fill_inverse_cmap(cinfo, c0,c1,c2); + /* Now emit the colormap index for this cell */ + *outptr++ = (JSAMPLE) (*cachep - 1); + } + } +} + + +METHODDEF(void) +pass2_fs_dither (j_decompress_ptr cinfo, + JSAMPARRAY input_buf, JSAMPARRAY output_buf, int num_rows) +/* This version performs Floyd-Steinberg dithering */ +{ + my_cquantize_ptr cquantize = (my_cquantize_ptr) cinfo->cquantize; + hist3d histogram = cquantize->histogram; + register LOCFSERROR cur0, cur1, cur2; /* current error or pixel value */ + LOCFSERROR belowerr0, belowerr1, belowerr2; /* error for pixel below cur */ + LOCFSERROR bpreverr0, bpreverr1, bpreverr2; /* error for below/prev col */ + register FSERRPTR errorptr; /* => fserrors[] at column before current */ + JSAMPROW inptr; /* => current input pixel */ + JSAMPROW outptr; /* => current output pixel */ + histptr cachep; + int dir; /* +1 or -1 depending on direction */ + int dir3; /* 3*dir, for advancing inptr & errorptr */ + int row; + JDIMENSION col; + JDIMENSION width = cinfo->output_width; + JSAMPLE *range_limit = cinfo->sample_range_limit; + int *error_limit = cquantize->error_limiter; + JSAMPROW colormap0 = cinfo->colormap[0]; + JSAMPROW colormap1 = cinfo->colormap[1]; + JSAMPROW colormap2 = cinfo->colormap[2]; + SHIFT_TEMPS + + for (row = 0; row < num_rows; row++) { + inptr = input_buf[row]; + outptr = output_buf[row]; + if (cquantize->on_odd_row) { + /* work right to left in this row */ + inptr += (width-1) * 3; /* so point to rightmost pixel */ + outptr += width-1; + dir = -1; + dir3 = -3; + errorptr = cquantize->fserrors + (width+1)*3; /* => entry after last column */ + cquantize->on_odd_row = FALSE; /* flip for next time */ + } else { + /* work left to right in this row */ + dir = 1; + dir3 = 3; + errorptr = cquantize->fserrors; /* => entry before first real column */ + cquantize->on_odd_row = TRUE; /* flip for next time */ + } + /* Preset error values: no error propagated to first pixel from left */ + cur0 = cur1 = cur2 = 0; + /* and no error propagated to row below yet */ + belowerr0 = belowerr1 = belowerr2 = 0; + bpreverr0 = bpreverr1 = bpreverr2 = 0; + + for (col = width; col > 0; col--) { + /* curN holds the error propagated from the previous pixel on the + * current line. Add the error propagated from the previous line + * to form the complete error correction term for this pixel, and + * round the error term (which is expressed * 16) to an integer. + * RIGHT_SHIFT rounds towards minus infinity, so adding 8 is correct + * for either sign of the error value. + * Note: errorptr points to *previous* column's array entry. + */ + cur0 = RIGHT_SHIFT(cur0 + errorptr[dir3+0] + 8, 4); + cur1 = RIGHT_SHIFT(cur1 + errorptr[dir3+1] + 8, 4); + cur2 = RIGHT_SHIFT(cur2 + errorptr[dir3+2] + 8, 4); + /* Limit the error using transfer function set by init_error_limit. + * See comments with init_error_limit for rationale. + */ + cur0 = error_limit[cur0]; + cur1 = error_limit[cur1]; + cur2 = error_limit[cur2]; + /* Form pixel value + error, and range-limit to 0..MAXJSAMPLE. + * The maximum error is +- MAXJSAMPLE (or less with error limiting); + * this sets the required size of the range_limit array. + */ + cur0 += GETJSAMPLE(inptr[0]); + cur1 += GETJSAMPLE(inptr[1]); + cur2 += GETJSAMPLE(inptr[2]); + cur0 = GETJSAMPLE(range_limit[cur0]); + cur1 = GETJSAMPLE(range_limit[cur1]); + cur2 = GETJSAMPLE(range_limit[cur2]); + /* Index into the cache with adjusted pixel value */ + cachep = & histogram[cur0>>C0_SHIFT][cur1>>C1_SHIFT][cur2>>C2_SHIFT]; + /* If we have not seen this color before, find nearest colormap */ + /* entry and update the cache */ + if (*cachep == 0) + fill_inverse_cmap(cinfo, cur0>>C0_SHIFT,cur1>>C1_SHIFT,cur2>>C2_SHIFT); + /* Now emit the colormap index for this cell */ + { register int pixcode = *cachep - 1; + *outptr = (JSAMPLE) pixcode; + /* Compute representation error for this pixel */ + cur0 -= GETJSAMPLE(colormap0[pixcode]); + cur1 -= GETJSAMPLE(colormap1[pixcode]); + cur2 -= GETJSAMPLE(colormap2[pixcode]); + } + /* Compute error fractions to be propagated to adjacent pixels. + * Add these into the running sums, and simultaneously shift the + * next-line error sums left by 1 column. + */ + { register LOCFSERROR bnexterr; + + bnexterr = cur0; /* Process component 0 */ + errorptr[0] = (FSERROR) (bpreverr0 + cur0 * 3); + bpreverr0 = belowerr0 + cur0 * 5; + belowerr0 = bnexterr; + cur0 *= 7; + bnexterr = cur1; /* Process component 1 */ + errorptr[1] = (FSERROR) (bpreverr1 + cur1 * 3); + bpreverr1 = belowerr1 + cur1 * 5; + belowerr1 = bnexterr; + cur1 *= 7; + bnexterr = cur2; /* Process component 2 */ + errorptr[2] = (FSERROR) (bpreverr2 + cur2 * 3); + bpreverr2 = belowerr2 + cur2 * 5; + belowerr2 = bnexterr; + cur2 *= 7; + } + /* At this point curN contains the 7/16 error value to be propagated + * to the next pixel on the current line, and all the errors for the + * next line have been shifted over. We are therefore ready to move on. + */ + inptr += dir3; /* Advance pixel pointers to next column */ + outptr += dir; + errorptr += dir3; /* advance errorptr to current column */ + } + /* Post-loop cleanup: we must unload the final error values into the + * final fserrors[] entry. Note we need not unload belowerrN because + * it is for the dummy column before or after the actual array. + */ + errorptr[0] = (FSERROR) bpreverr0; /* unload prev errs into array */ + errorptr[1] = (FSERROR) bpreverr1; + errorptr[2] = (FSERROR) bpreverr2; + } +} + + +/* + * Initialize the error-limiting transfer function (lookup table). + * The raw F-S error computation can potentially compute error values of up to + * +- MAXJSAMPLE. But we want the maximum correction applied to a pixel to be + * much less, otherwise obviously wrong pixels will be created. (Typical + * effects include weird fringes at color-area boundaries, isolated bright + * pixels in a dark area, etc.) The standard advice for avoiding this problem + * is to ensure that the "corners" of the color cube are allocated as output + * colors; then repeated errors in the same direction cannot cause cascading + * error buildup. However, that only prevents the error from getting + * completely out of hand; Aaron Giles reports that error limiting improves + * the results even with corner colors allocated. + * A simple clamping of the error values to about +- MAXJSAMPLE/8 works pretty + * well, but the smoother transfer function used below is even better. Thanks + * to Aaron Giles for this idea. + */ + +LOCAL(void) +init_error_limit (j_decompress_ptr cinfo) +/* Allocate and fill in the error_limiter table */ +{ + my_cquantize_ptr cquantize = (my_cquantize_ptr) cinfo->cquantize; + int *table; + int in, out; + + table = (int *) (*cinfo->mem->alloc_small) + ((j_common_ptr) cinfo, JPOOL_IMAGE, (MAXJSAMPLE*2+1) * sizeof(int)); + table += MAXJSAMPLE; /* so can index -MAXJSAMPLE .. +MAXJSAMPLE */ + cquantize->error_limiter = table; + +#define STEPSIZE ((MAXJSAMPLE+1)/16) + /* Map errors 1:1 up to +- MAXJSAMPLE/16 */ + out = 0; + for (in = 0; in < STEPSIZE; in++, out++) { + table[in] = out; table[-in] = -out; + } + /* Map errors 1:2 up to +- 3*MAXJSAMPLE/16 */ + for (; in < STEPSIZE*3; in++, out += (in&1) ? 0 : 1) { + table[in] = out; table[-in] = -out; + } + /* Clamp the rest to final out value (which is (MAXJSAMPLE+1)/8) */ + for (; in <= MAXJSAMPLE; in++) { + table[in] = out; table[-in] = -out; + } +#undef STEPSIZE +} + + +/* + * Finish up at the end of each pass. + */ + +METHODDEF(void) +finish_pass1 (j_decompress_ptr cinfo) +{ + my_cquantize_ptr cquantize = (my_cquantize_ptr) cinfo->cquantize; + + /* Select the representative colors and fill in cinfo->colormap */ + cinfo->colormap = cquantize->sv_colormap; + select_colors(cinfo, cquantize->desired); + /* Force next pass to zero the color index table */ + cquantize->needs_zeroed = TRUE; +} + + +METHODDEF(void) +finish_pass2 (j_decompress_ptr cinfo) +{ + /* no work */ +} + + +/* + * Initialize for each processing pass. + */ + +METHODDEF(void) +start_pass_2_quant (j_decompress_ptr cinfo, boolean is_pre_scan) +{ + my_cquantize_ptr cquantize = (my_cquantize_ptr) cinfo->cquantize; + hist3d histogram = cquantize->histogram; + int i; + + /* Only F-S dithering or no dithering is supported. */ + /* If user asks for ordered dither, give him F-S. */ + if (cinfo->dither_mode != JDITHER_NONE) + cinfo->dither_mode = JDITHER_FS; + + if (is_pre_scan) { + /* Set up method pointers */ + cquantize->pub.color_quantize = prescan_quantize; + cquantize->pub.finish_pass = finish_pass1; + cquantize->needs_zeroed = TRUE; /* Always zero histogram */ + } else { + /* Set up method pointers */ + if (cinfo->dither_mode == JDITHER_FS) + cquantize->pub.color_quantize = pass2_fs_dither; + else + cquantize->pub.color_quantize = pass2_no_dither; + cquantize->pub.finish_pass = finish_pass2; + + /* Make sure color count is acceptable */ + i = cinfo->actual_number_of_colors; + if (i < 1) + ERREXIT1(cinfo, JERR_QUANT_FEW_COLORS, 1); + if (i > MAXNUMCOLORS) + ERREXIT1(cinfo, JERR_QUANT_MANY_COLORS, MAXNUMCOLORS); + + if (cinfo->dither_mode == JDITHER_FS) { + size_t arraysize = (size_t) ((cinfo->output_width + 2) * + (3 * sizeof(FSERROR))); + /* Allocate Floyd-Steinberg workspace if we didn't already. */ + if (cquantize->fserrors == NULL) + cquantize->fserrors = (FSERRPTR) (*cinfo->mem->alloc_large) + ((j_common_ptr) cinfo, JPOOL_IMAGE, arraysize); + /* Initialize the propagated errors to zero. */ + jzero_far((void *) cquantize->fserrors, arraysize); + /* Make the error-limit table if we didn't already. */ + if (cquantize->error_limiter == NULL) + init_error_limit(cinfo); + cquantize->on_odd_row = FALSE; + } + + } + /* Zero the histogram or inverse color map, if necessary */ + if (cquantize->needs_zeroed) { + for (i = 0; i < HIST_C0_ELEMS; i++) { + jzero_far((void *) histogram[i], + HIST_C1_ELEMS*HIST_C2_ELEMS * sizeof(histcell)); + } + cquantize->needs_zeroed = FALSE; + } +} + + +/* + * Switch to a new external colormap between output passes. + */ + +METHODDEF(void) +new_color_map_2_quant (j_decompress_ptr cinfo) +{ + my_cquantize_ptr cquantize = (my_cquantize_ptr) cinfo->cquantize; + + /* Reset the inverse color map */ + cquantize->needs_zeroed = TRUE; +} + + +/* + * Module initialization routine for 2-pass color quantization. + */ + +GLOBAL(void) +jinit_2pass_quantizer (j_decompress_ptr cinfo) +{ + my_cquantize_ptr cquantize; + int i; + + cquantize = (my_cquantize_ptr) + (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE, + sizeof(my_cquantizer)); + cinfo->cquantize = (struct jpeg_color_quantizer *) cquantize; + cquantize->pub.start_pass = start_pass_2_quant; + cquantize->pub.new_color_map = new_color_map_2_quant; + cquantize->fserrors = NULL; /* flag optional arrays not allocated */ + cquantize->error_limiter = NULL; + + /* Make sure jdmaster didn't give me a case I can't handle */ + if (cinfo->out_color_components != 3) + ERREXIT(cinfo, JERR_NOTIMPL); + + /* Allocate the histogram/inverse colormap storage */ + cquantize->histogram = (hist3d) (*cinfo->mem->alloc_small) + ((j_common_ptr) cinfo, JPOOL_IMAGE, HIST_C0_ELEMS * sizeof(hist2d)); + for (i = 0; i < HIST_C0_ELEMS; i++) { + cquantize->histogram[i] = (hist2d) (*cinfo->mem->alloc_large) + ((j_common_ptr) cinfo, JPOOL_IMAGE, + HIST_C1_ELEMS*HIST_C2_ELEMS * sizeof(histcell)); + } + cquantize->needs_zeroed = TRUE; /* histogram is garbage now */ + + /* Allocate storage for the completed colormap, if required. + * We do this now since it may affect the memory manager's space + * calculations. + */ + if (cinfo->enable_2pass_quant) { + /* Make sure color count is acceptable */ + int desired = cinfo->desired_number_of_colors; + /* Lower bound on # of colors ... somewhat arbitrary as long as > 0 */ + if (desired < 8) + ERREXIT1(cinfo, JERR_QUANT_FEW_COLORS, 8); + /* Make sure colormap indexes can be represented by JSAMPLEs */ + if (desired > MAXNUMCOLORS) + ERREXIT1(cinfo, JERR_QUANT_MANY_COLORS, MAXNUMCOLORS); + cquantize->sv_colormap = (*cinfo->mem->alloc_sarray) + ((j_common_ptr) cinfo,JPOOL_IMAGE, (JDIMENSION) desired, (JDIMENSION) 3); + cquantize->desired = desired; + } else + cquantize->sv_colormap = NULL; + + /* Only F-S dithering or no dithering is supported. */ + /* If user asks for ordered dither, give him F-S. */ + if (cinfo->dither_mode != JDITHER_NONE) + cinfo->dither_mode = JDITHER_FS; + + /* Allocate Floyd-Steinberg workspace if necessary. + * This isn't really needed until pass 2, but again it may affect the memory + * manager's space calculations. Although we will cope with a later change + * in dither_mode, we do not promise to honor max_memory_to_use if + * dither_mode changes. + */ + if (cinfo->dither_mode == JDITHER_FS) { + cquantize->fserrors = (FSERRPTR) (*cinfo->mem->alloc_large) + ((j_common_ptr) cinfo, JPOOL_IMAGE, + (size_t) ((cinfo->output_width + 2) * (3 * sizeof(FSERROR)))); + /* Might as well create the error-limiting table too. */ + init_error_limit(cinfo); + } +} + +#endif /* QUANT_2PASS_SUPPORTED */ diff --git a/media/libjpeg/jsimd.h b/media/libjpeg/jsimd.h new file mode 100644 index 000000000..3aa0779b8 --- /dev/null +++ b/media/libjpeg/jsimd.h @@ -0,0 +1,93 @@ +/* + * jsimd.h + * + * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB + * Copyright (C) 2011, 2014, D. R. Commander. + * Copyright (C) 2015, Matthieu Darbois. + * + * Based on the x86 SIMD extension for IJG JPEG library, + * Copyright (C) 1999-2006, MIYASAKA Masaru. + * For conditions of distribution and use, see copyright notice in jsimdext.inc + * + */ + +#include "jchuff.h" /* Declarations shared with jcphuff.c */ + +EXTERN(int) jsimd_can_rgb_ycc (void); +EXTERN(int) jsimd_can_rgb_gray (void); +EXTERN(int) jsimd_can_ycc_rgb (void); +EXTERN(int) jsimd_can_ycc_rgb565 (void); +EXTERN(int) jsimd_c_can_null_convert (void); + +EXTERN(void) jsimd_rgb_ycc_convert + (j_compress_ptr cinfo, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); +EXTERN(void) jsimd_rgb_gray_convert + (j_compress_ptr cinfo, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); +EXTERN(void) jsimd_ycc_rgb_convert + (j_decompress_ptr cinfo, JSAMPIMAGE input_buf, JDIMENSION input_row, + JSAMPARRAY output_buf, int num_rows); +EXTERN(void) jsimd_ycc_rgb565_convert + (j_decompress_ptr cinfo, JSAMPIMAGE input_buf, JDIMENSION input_row, + JSAMPARRAY output_buf, int num_rows); +EXTERN(void) jsimd_c_null_convert + (j_compress_ptr cinfo, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); + +EXTERN(int) jsimd_can_h2v2_downsample (void); +EXTERN(int) jsimd_can_h2v1_downsample (void); + +EXTERN(void) jsimd_h2v2_downsample + (j_compress_ptr cinfo, jpeg_component_info *compptr, + JSAMPARRAY input_data, JSAMPARRAY output_data); + +EXTERN(int) jsimd_can_h2v2_smooth_downsample (void); + +EXTERN(void) jsimd_h2v2_smooth_downsample + (j_compress_ptr cinfo, jpeg_component_info *compptr, + JSAMPARRAY input_data, JSAMPARRAY output_data); + +EXTERN(void) jsimd_h2v1_downsample + (j_compress_ptr cinfo, jpeg_component_info *compptr, + JSAMPARRAY input_data, JSAMPARRAY output_data); + +EXTERN(int) jsimd_can_h2v2_upsample (void); +EXTERN(int) jsimd_can_h2v1_upsample (void); +EXTERN(int) jsimd_can_int_upsample (void); + +EXTERN(void) jsimd_h2v2_upsample + (j_decompress_ptr cinfo, jpeg_component_info *compptr, + JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr); +EXTERN(void) jsimd_h2v1_upsample + (j_decompress_ptr cinfo, jpeg_component_info *compptr, + JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr); +EXTERN(void) jsimd_int_upsample + (j_decompress_ptr cinfo, jpeg_component_info *compptr, + JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr); + +EXTERN(int) jsimd_can_h2v2_fancy_upsample (void); +EXTERN(int) jsimd_can_h2v1_fancy_upsample (void); + +EXTERN(void) jsimd_h2v2_fancy_upsample + (j_decompress_ptr cinfo, jpeg_component_info *compptr, + JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr); +EXTERN(void) jsimd_h2v1_fancy_upsample + (j_decompress_ptr cinfo, jpeg_component_info *compptr, + JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr); + +EXTERN(int) jsimd_can_h2v2_merged_upsample (void); +EXTERN(int) jsimd_can_h2v1_merged_upsample (void); + +EXTERN(void) jsimd_h2v2_merged_upsample + (j_decompress_ptr cinfo, JSAMPIMAGE input_buf, + JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf); +EXTERN(void) jsimd_h2v1_merged_upsample + (j_decompress_ptr cinfo, JSAMPIMAGE input_buf, + JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf); + +EXTERN(int) jsimd_can_huff_encode_one_block (void); + +EXTERN(JOCTET*) jsimd_huff_encode_one_block + (void *state, JOCTET *buffer, JCOEFPTR block, int last_dc_val, + c_derived_tbl *dctbl, c_derived_tbl *actbl); diff --git a/media/libjpeg/jsimd_none.c b/media/libjpeg/jsimd_none.c new file mode 100644 index 000000000..f29030cfa --- /dev/null +++ b/media/libjpeg/jsimd_none.c @@ -0,0 +1,404 @@ +/* + * jsimd_none.c + * + * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB + * Copyright (C) 2009-2011, 2014, D. R. Commander. + * Copyright (C) 2015, Matthieu Darbois. + * + * Based on the x86 SIMD extension for IJG JPEG library, + * Copyright (C) 1999-2006, MIYASAKA Masaru. + * For conditions of distribution and use, see copyright notice in jsimdext.inc + * + * This file contains stubs for when there is no SIMD support available. + */ + +#define JPEG_INTERNALS +#include "jinclude.h" +#include "jpeglib.h" +#include "jsimd.h" +#include "jdct.h" +#include "jsimddct.h" + +GLOBAL(int) +jsimd_can_rgb_ycc (void) +{ + return 0; +} + +GLOBAL(int) +jsimd_can_rgb_gray (void) +{ + return 0; +} + +GLOBAL(int) +jsimd_can_ycc_rgb (void) +{ + return 0; +} + +GLOBAL(int) +jsimd_can_ycc_rgb565 (void) +{ + return 0; +} + +GLOBAL(int) +jsimd_c_can_null_convert (void) +{ + return 0; +} + +GLOBAL(void) +jsimd_rgb_ycc_convert (j_compress_ptr cinfo, + JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows) +{ +} + +GLOBAL(void) +jsimd_rgb_gray_convert (j_compress_ptr cinfo, + JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows) +{ +} + +GLOBAL(void) +jsimd_ycc_rgb_convert (j_decompress_ptr cinfo, + JSAMPIMAGE input_buf, JDIMENSION input_row, + JSAMPARRAY output_buf, int num_rows) +{ +} + +GLOBAL(void) +jsimd_ycc_rgb565_convert (j_decompress_ptr cinfo, + JSAMPIMAGE input_buf, JDIMENSION input_row, + JSAMPARRAY output_buf, int num_rows) +{ +} + +GLOBAL(void) +jsimd_c_null_convert (j_compress_ptr cinfo, + JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows) +{ +} + +GLOBAL(int) +jsimd_can_h2v2_downsample (void) +{ + return 0; +} + +GLOBAL(int) +jsimd_can_h2v1_downsample (void) +{ + return 0; +} + +GLOBAL(int) +jsimd_can_h2v2_smooth_downsample (void) +{ + return 0; +} + +GLOBAL(void) +jsimd_h2v2_downsample (j_compress_ptr cinfo, jpeg_component_info *compptr, + JSAMPARRAY input_data, JSAMPARRAY output_data) +{ +} + +GLOBAL(void) +jsimd_h2v2_smooth_downsample (j_compress_ptr cinfo, + jpeg_component_info *compptr, + JSAMPARRAY input_data, JSAMPARRAY output_data) +{ +} + +GLOBAL(void) +jsimd_h2v1_downsample (j_compress_ptr cinfo, jpeg_component_info *compptr, + JSAMPARRAY input_data, JSAMPARRAY output_data) +{ +} + +GLOBAL(int) +jsimd_can_h2v2_upsample (void) +{ + return 0; +} + +GLOBAL(int) +jsimd_can_h2v1_upsample (void) +{ + return 0; +} + +GLOBAL(int) +jsimd_can_int_upsample (void) +{ + return 0; +} + +GLOBAL(void) +jsimd_int_upsample (j_decompress_ptr cinfo, jpeg_component_info *compptr, + JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr) +{ +} + +GLOBAL(void) +jsimd_h2v2_upsample (j_decompress_ptr cinfo, + jpeg_component_info *compptr, + JSAMPARRAY input_data, + JSAMPARRAY *output_data_ptr) +{ +} + +GLOBAL(void) +jsimd_h2v1_upsample (j_decompress_ptr cinfo, + jpeg_component_info *compptr, + JSAMPARRAY input_data, + JSAMPARRAY *output_data_ptr) +{ +} + +GLOBAL(int) +jsimd_can_h2v2_fancy_upsample (void) +{ + return 0; +} + +GLOBAL(int) +jsimd_can_h2v1_fancy_upsample (void) +{ + return 0; +} + +GLOBAL(void) +jsimd_h2v2_fancy_upsample (j_decompress_ptr cinfo, + jpeg_component_info *compptr, + JSAMPARRAY input_data, + JSAMPARRAY *output_data_ptr) +{ +} + +GLOBAL(void) +jsimd_h2v1_fancy_upsample (j_decompress_ptr cinfo, + jpeg_component_info *compptr, + JSAMPARRAY input_data, + JSAMPARRAY *output_data_ptr) +{ +} + +GLOBAL(int) +jsimd_can_h2v2_merged_upsample (void) +{ + return 0; +} + +GLOBAL(int) +jsimd_can_h2v1_merged_upsample (void) +{ + return 0; +} + +GLOBAL(void) +jsimd_h2v2_merged_upsample (j_decompress_ptr cinfo, + JSAMPIMAGE input_buf, + JDIMENSION in_row_group_ctr, + JSAMPARRAY output_buf) +{ +} + +GLOBAL(void) +jsimd_h2v1_merged_upsample (j_decompress_ptr cinfo, + JSAMPIMAGE input_buf, + JDIMENSION in_row_group_ctr, + JSAMPARRAY output_buf) +{ +} + +GLOBAL(int) +jsimd_can_convsamp (void) +{ + return 0; +} + +GLOBAL(int) +jsimd_can_convsamp_float (void) +{ + return 0; +} + +GLOBAL(void) +jsimd_convsamp (JSAMPARRAY sample_data, JDIMENSION start_col, + DCTELEM *workspace) +{ +} + +GLOBAL(void) +jsimd_convsamp_float (JSAMPARRAY sample_data, JDIMENSION start_col, + FAST_FLOAT *workspace) +{ +} + +GLOBAL(int) +jsimd_can_fdct_islow (void) +{ + return 0; +} + +GLOBAL(int) +jsimd_can_fdct_ifast (void) +{ + return 0; +} + +GLOBAL(int) +jsimd_can_fdct_float (void) +{ + return 0; +} + +GLOBAL(void) +jsimd_fdct_islow (DCTELEM *data) +{ +} + +GLOBAL(void) +jsimd_fdct_ifast (DCTELEM *data) +{ +} + +GLOBAL(void) +jsimd_fdct_float (FAST_FLOAT *data) +{ +} + +GLOBAL(int) +jsimd_can_quantize (void) +{ + return 0; +} + +GLOBAL(int) +jsimd_can_quantize_float (void) +{ + return 0; +} + +GLOBAL(void) +jsimd_quantize (JCOEFPTR coef_block, DCTELEM *divisors, + DCTELEM *workspace) +{ +} + +GLOBAL(void) +jsimd_quantize_float (JCOEFPTR coef_block, FAST_FLOAT *divisors, + FAST_FLOAT *workspace) +{ +} + +GLOBAL(int) +jsimd_can_idct_2x2 (void) +{ + return 0; +} + +GLOBAL(int) +jsimd_can_idct_4x4 (void) +{ + return 0; +} + +GLOBAL(int) +jsimd_can_idct_6x6 (void) +{ + return 0; +} + +GLOBAL(int) +jsimd_can_idct_12x12 (void) +{ + return 0; +} + +GLOBAL(void) +jsimd_idct_2x2 (j_decompress_ptr cinfo, jpeg_component_info *compptr, + JCOEFPTR coef_block, JSAMPARRAY output_buf, + JDIMENSION output_col) +{ +} + +GLOBAL(void) +jsimd_idct_4x4 (j_decompress_ptr cinfo, jpeg_component_info *compptr, + JCOEFPTR coef_block, JSAMPARRAY output_buf, + JDIMENSION output_col) +{ +} + +GLOBAL(void) +jsimd_idct_6x6 (j_decompress_ptr cinfo, jpeg_component_info *compptr, + JCOEFPTR coef_block, JSAMPARRAY output_buf, + JDIMENSION output_col) +{ +} + +GLOBAL(void) +jsimd_idct_12x12 (j_decompress_ptr cinfo, jpeg_component_info *compptr, + JCOEFPTR coef_block, JSAMPARRAY output_buf, + JDIMENSION output_col) +{ +} + +GLOBAL(int) +jsimd_can_idct_islow (void) +{ + return 0; +} + +GLOBAL(int) +jsimd_can_idct_ifast (void) +{ + return 0; +} + +GLOBAL(int) +jsimd_can_idct_float (void) +{ + return 0; +} + +GLOBAL(void) +jsimd_idct_islow (j_decompress_ptr cinfo, jpeg_component_info *compptr, + JCOEFPTR coef_block, JSAMPARRAY output_buf, + JDIMENSION output_col) +{ +} + +GLOBAL(void) +jsimd_idct_ifast (j_decompress_ptr cinfo, jpeg_component_info *compptr, + JCOEFPTR coef_block, JSAMPARRAY output_buf, + JDIMENSION output_col) +{ +} + +GLOBAL(void) +jsimd_idct_float (j_decompress_ptr cinfo, jpeg_component_info *compptr, + JCOEFPTR coef_block, JSAMPARRAY output_buf, + JDIMENSION output_col) +{ +} + +GLOBAL(int) +jsimd_can_huff_encode_one_block (void) +{ + return 0; +} + +GLOBAL(JOCTET*) +jsimd_huff_encode_one_block (void *state, JOCTET *buffer, JCOEFPTR block, + int last_dc_val, c_derived_tbl *dctbl, + c_derived_tbl *actbl) +{ + return NULL; +} diff --git a/media/libjpeg/jsimddct.h b/media/libjpeg/jsimddct.h new file mode 100644 index 000000000..b19ab48d4 --- /dev/null +++ b/media/libjpeg/jsimddct.h @@ -0,0 +1,74 @@ +/* + * jsimddct.h + * + * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB + * + * Based on the x86 SIMD extension for IJG JPEG library, + * Copyright (C) 1999-2006, MIYASAKA Masaru. + * For conditions of distribution and use, see copyright notice in jsimdext.inc + * + */ + +EXTERN(int) jsimd_can_convsamp (void); +EXTERN(int) jsimd_can_convsamp_float (void); + +EXTERN(void) jsimd_convsamp (JSAMPARRAY sample_data, JDIMENSION start_col, + DCTELEM *workspace); +EXTERN(void) jsimd_convsamp_float (JSAMPARRAY sample_data, + JDIMENSION start_col, + FAST_FLOAT *workspace); + +EXTERN(int) jsimd_can_fdct_islow (void); +EXTERN(int) jsimd_can_fdct_ifast (void); +EXTERN(int) jsimd_can_fdct_float (void); + +EXTERN(void) jsimd_fdct_islow (DCTELEM *data); +EXTERN(void) jsimd_fdct_ifast (DCTELEM *data); +EXTERN(void) jsimd_fdct_float (FAST_FLOAT *data); + +EXTERN(int) jsimd_can_quantize (void); +EXTERN(int) jsimd_can_quantize_float (void); + +EXTERN(void) jsimd_quantize (JCOEFPTR coef_block, DCTELEM *divisors, + DCTELEM *workspace); +EXTERN(void) jsimd_quantize_float (JCOEFPTR coef_block, FAST_FLOAT *divisors, + FAST_FLOAT *workspace); + +EXTERN(int) jsimd_can_idct_2x2 (void); +EXTERN(int) jsimd_can_idct_4x4 (void); +EXTERN(int) jsimd_can_idct_6x6 (void); +EXTERN(int) jsimd_can_idct_12x12 (void); + +EXTERN(void) jsimd_idct_2x2 (j_decompress_ptr cinfo, + jpeg_component_info *compptr, + JCOEFPTR coef_block, JSAMPARRAY output_buf, + JDIMENSION output_col); +EXTERN(void) jsimd_idct_4x4 (j_decompress_ptr cinfo, + jpeg_component_info *compptr, + JCOEFPTR coef_block, JSAMPARRAY output_buf, + JDIMENSION output_col); +EXTERN(void) jsimd_idct_6x6 (j_decompress_ptr cinfo, + jpeg_component_info *compptr, + JCOEFPTR coef_block, JSAMPARRAY output_buf, + JDIMENSION output_col); +EXTERN(void) jsimd_idct_12x12 (j_decompress_ptr cinfo, + jpeg_component_info *compptr, + JCOEFPTR coef_block, JSAMPARRAY output_buf, + JDIMENSION output_col); + +EXTERN(int) jsimd_can_idct_islow (void); +EXTERN(int) jsimd_can_idct_ifast (void); +EXTERN(int) jsimd_can_idct_float (void); + +EXTERN(void) jsimd_idct_islow (j_decompress_ptr cinfo, + jpeg_component_info *compptr, + JCOEFPTR coef_block, JSAMPARRAY output_buf, + JDIMENSION output_col); +EXTERN(void) jsimd_idct_ifast (j_decompress_ptr cinfo, + jpeg_component_info *compptr, + JCOEFPTR coef_block, JSAMPARRAY output_buf, + JDIMENSION output_col); +EXTERN(void) jsimd_idct_float (j_decompress_ptr cinfo, + jpeg_component_info *compptr, + JCOEFPTR coef_block, JSAMPARRAY output_buf, + JDIMENSION output_col); diff --git a/media/libjpeg/jstdhuff.c b/media/libjpeg/jstdhuff.c new file mode 100644 index 000000000..e202e8e7e --- /dev/null +++ b/media/libjpeg/jstdhuff.c @@ -0,0 +1,135 @@ +/* + * jstdhuff.c + * + * This file was part of the Independent JPEG Group's software: + * Copyright (C) 1991-1998, Thomas G. Lane. + * libjpeg-turbo Modifications: + * Copyright (C) 2013, D. R. Commander. + * For conditions of distribution and use, see the accompanying README.ijg + * file. + * + * This file contains routines to set the default Huffman tables, if they are + * not already set. + */ + +/* + * Huffman table setup routines + */ + +LOCAL(void) +add_huff_table (j_common_ptr cinfo, + JHUFF_TBL **htblptr, const UINT8 *bits, const UINT8 *val) +/* Define a Huffman table */ +{ + int nsymbols, len; + + if (*htblptr == NULL) + *htblptr = jpeg_alloc_huff_table(cinfo); + else + return; + + /* Copy the number-of-symbols-of-each-code-length counts */ + MEMCOPY((*htblptr)->bits, bits, sizeof((*htblptr)->bits)); + + /* Validate the counts. We do this here mainly so we can copy the right + * number of symbols from the val[] array, without risking marching off + * the end of memory. jchuff.c will do a more thorough test later. + */ + nsymbols = 0; + for (len = 1; len <= 16; len++) + nsymbols += bits[len]; + if (nsymbols < 1 || nsymbols > 256) + ERREXIT(cinfo, JERR_BAD_HUFF_TABLE); + + MEMCOPY((*htblptr)->huffval, val, nsymbols * sizeof(UINT8)); + MEMZERO(&((*htblptr)->huffval[nsymbols]), (256 - nsymbols) * sizeof(UINT8)); + + /* Initialize sent_table FALSE so table will be written to JPEG file. */ + (*htblptr)->sent_table = FALSE; +} + + +LOCAL(void) +std_huff_tables (j_common_ptr cinfo) +/* Set up the standard Huffman tables (cf. JPEG standard section K.3) */ +/* IMPORTANT: these are only valid for 8-bit data precision! */ +{ + JHUFF_TBL **dc_huff_tbl_ptrs, **ac_huff_tbl_ptrs; + + static const UINT8 bits_dc_luminance[17] = + { /* 0-base */ 0, 0, 1, 5, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0 }; + static const UINT8 val_dc_luminance[] = + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11 }; + + static const UINT8 bits_dc_chrominance[17] = + { /* 0-base */ 0, 0, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0 }; + static const UINT8 val_dc_chrominance[] = + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11 }; + + static const UINT8 bits_ac_luminance[17] = + { /* 0-base */ 0, 0, 2, 1, 3, 3, 2, 4, 3, 5, 5, 4, 4, 0, 0, 1, 0x7d }; + static const UINT8 val_ac_luminance[] = + { 0x01, 0x02, 0x03, 0x00, 0x04, 0x11, 0x05, 0x12, + 0x21, 0x31, 0x41, 0x06, 0x13, 0x51, 0x61, 0x07, + 0x22, 0x71, 0x14, 0x32, 0x81, 0x91, 0xa1, 0x08, + 0x23, 0x42, 0xb1, 0xc1, 0x15, 0x52, 0xd1, 0xf0, + 0x24, 0x33, 0x62, 0x72, 0x82, 0x09, 0x0a, 0x16, + 0x17, 0x18, 0x19, 0x1a, 0x25, 0x26, 0x27, 0x28, + 0x29, 0x2a, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, + 0x3a, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49, + 0x4a, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59, + 0x5a, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, + 0x6a, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, + 0x7a, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, + 0x8a, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, + 0x99, 0x9a, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, + 0xa8, 0xa9, 0xaa, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, + 0xb7, 0xb8, 0xb9, 0xba, 0xc2, 0xc3, 0xc4, 0xc5, + 0xc6, 0xc7, 0xc8, 0xc9, 0xca, 0xd2, 0xd3, 0xd4, + 0xd5, 0xd6, 0xd7, 0xd8, 0xd9, 0xda, 0xe1, 0xe2, + 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, 0xe8, 0xe9, 0xea, + 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, 0xf8, + 0xf9, 0xfa }; + + static const UINT8 bits_ac_chrominance[17] = + { /* 0-base */ 0, 0, 2, 1, 2, 4, 4, 3, 4, 7, 5, 4, 4, 0, 1, 2, 0x77 }; + static const UINT8 val_ac_chrominance[] = + { 0x00, 0x01, 0x02, 0x03, 0x11, 0x04, 0x05, 0x21, + 0x31, 0x06, 0x12, 0x41, 0x51, 0x07, 0x61, 0x71, + 0x13, 0x22, 0x32, 0x81, 0x08, 0x14, 0x42, 0x91, + 0xa1, 0xb1, 0xc1, 0x09, 0x23, 0x33, 0x52, 0xf0, + 0x15, 0x62, 0x72, 0xd1, 0x0a, 0x16, 0x24, 0x34, + 0xe1, 0x25, 0xf1, 0x17, 0x18, 0x19, 0x1a, 0x26, + 0x27, 0x28, 0x29, 0x2a, 0x35, 0x36, 0x37, 0x38, + 0x39, 0x3a, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, + 0x49, 0x4a, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, + 0x59, 0x5a, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, + 0x69, 0x6a, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, + 0x79, 0x7a, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, + 0x88, 0x89, 0x8a, 0x92, 0x93, 0x94, 0x95, 0x96, + 0x97, 0x98, 0x99, 0x9a, 0xa2, 0xa3, 0xa4, 0xa5, + 0xa6, 0xa7, 0xa8, 0xa9, 0xaa, 0xb2, 0xb3, 0xb4, + 0xb5, 0xb6, 0xb7, 0xb8, 0xb9, 0xba, 0xc2, 0xc3, + 0xc4, 0xc5, 0xc6, 0xc7, 0xc8, 0xc9, 0xca, 0xd2, + 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, 0xd8, 0xd9, 0xda, + 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, 0xe8, 0xe9, + 0xea, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, 0xf8, + 0xf9, 0xfa }; + + if (cinfo->is_decompressor) { + dc_huff_tbl_ptrs = ((j_decompress_ptr)cinfo)->dc_huff_tbl_ptrs; + ac_huff_tbl_ptrs = ((j_decompress_ptr)cinfo)->ac_huff_tbl_ptrs; + } else { + dc_huff_tbl_ptrs = ((j_compress_ptr)cinfo)->dc_huff_tbl_ptrs; + ac_huff_tbl_ptrs = ((j_compress_ptr)cinfo)->ac_huff_tbl_ptrs; + } + + add_huff_table(cinfo, &dc_huff_tbl_ptrs[0], bits_dc_luminance, + val_dc_luminance); + add_huff_table(cinfo, &ac_huff_tbl_ptrs[0], bits_ac_luminance, + val_ac_luminance); + add_huff_table(cinfo, &dc_huff_tbl_ptrs[1], bits_dc_chrominance, + val_dc_chrominance); + add_huff_table(cinfo, &ac_huff_tbl_ptrs[1], bits_ac_chrominance, + val_ac_chrominance); +} diff --git a/media/libjpeg/jutils.c b/media/libjpeg/jutils.c new file mode 100644 index 000000000..f9d35023e --- /dev/null +++ b/media/libjpeg/jutils.c @@ -0,0 +1,133 @@ +/* + * jutils.c + * + * This file was part of the Independent JPEG Group's software: + * Copyright (C) 1991-1996, Thomas G. Lane. + * It was modified by The libjpeg-turbo Project to include only code + * relevant to libjpeg-turbo. + * For conditions of distribution and use, see the accompanying README.ijg + * file. + * + * This file contains tables and miscellaneous utility routines needed + * for both compression and decompression. + * Note we prefix all global names with "j" to minimize conflicts with + * a surrounding application. + */ + +#define JPEG_INTERNALS +#include "jinclude.h" +#include "jpeglib.h" + + +/* + * jpeg_zigzag_order[i] is the zigzag-order position of the i'th element + * of a DCT block read in natural order (left to right, top to bottom). + */ + +#if 0 /* This table is not actually needed in v6a */ + +const int jpeg_zigzag_order[DCTSIZE2] = { + 0, 1, 5, 6, 14, 15, 27, 28, + 2, 4, 7, 13, 16, 26, 29, 42, + 3, 8, 12, 17, 25, 30, 41, 43, + 9, 11, 18, 24, 31, 40, 44, 53, + 10, 19, 23, 32, 39, 45, 52, 54, + 20, 22, 33, 38, 46, 51, 55, 60, + 21, 34, 37, 47, 50, 56, 59, 61, + 35, 36, 48, 49, 57, 58, 62, 63 +}; + +#endif + +/* + * jpeg_natural_order[i] is the natural-order position of the i'th element + * of zigzag order. + * + * When reading corrupted data, the Huffman decoders could attempt + * to reference an entry beyond the end of this array (if the decoded + * zero run length reaches past the end of the block). To prevent + * wild stores without adding an inner-loop test, we put some extra + * "63"s after the real entries. This will cause the extra coefficient + * to be stored in location 63 of the block, not somewhere random. + * The worst case would be a run-length of 15, which means we need 16 + * fake entries. + */ + +const int jpeg_natural_order[DCTSIZE2+16] = { + 0, 1, 8, 16, 9, 2, 3, 10, + 17, 24, 32, 25, 18, 11, 4, 5, + 12, 19, 26, 33, 40, 48, 41, 34, + 27, 20, 13, 6, 7, 14, 21, 28, + 35, 42, 49, 56, 57, 50, 43, 36, + 29, 22, 15, 23, 30, 37, 44, 51, + 58, 59, 52, 45, 38, 31, 39, 46, + 53, 60, 61, 54, 47, 55, 62, 63, + 63, 63, 63, 63, 63, 63, 63, 63, /* extra entries for safety in decoder */ + 63, 63, 63, 63, 63, 63, 63, 63 +}; + + +/* + * Arithmetic utilities + */ + +GLOBAL(long) +jdiv_round_up (long a, long b) +/* Compute a/b rounded up to next integer, ie, ceil(a/b) */ +/* Assumes a >= 0, b > 0 */ +{ + return (a + b - 1L) / b; +} + + +GLOBAL(long) +jround_up (long a, long b) +/* Compute a rounded up to next multiple of b, ie, ceil(a/b)*b */ +/* Assumes a >= 0, b > 0 */ +{ + a += b - 1L; + return a - (a % b); +} + + +GLOBAL(void) +jcopy_sample_rows (JSAMPARRAY input_array, int source_row, + JSAMPARRAY output_array, int dest_row, + int num_rows, JDIMENSION num_cols) +/* Copy some rows of samples from one place to another. + * num_rows rows are copied from input_array[source_row++] + * to output_array[dest_row++]; these areas may overlap for duplication. + * The source and destination arrays must be at least as wide as num_cols. + */ +{ + register JSAMPROW inptr, outptr; + register size_t count = (size_t) (num_cols * sizeof(JSAMPLE)); + register int row; + + input_array += source_row; + output_array += dest_row; + + for (row = num_rows; row > 0; row--) { + inptr = *input_array++; + outptr = *output_array++; + MEMCOPY(outptr, inptr, count); + } +} + + +GLOBAL(void) +jcopy_block_row (JBLOCKROW input_row, JBLOCKROW output_row, + JDIMENSION num_blocks) +/* Copy a row of coefficient blocks from one place to another. */ +{ + MEMCOPY(output_row, input_row, num_blocks * (DCTSIZE2 * sizeof(JCOEF))); +} + + +GLOBAL(void) +jzero_far (void *target, size_t bytestozero) +/* Zero out a chunk of memory. */ +/* This might be sample-array data, block-array data, or alloc_large data. */ +{ + MEMZERO(target, bytestozero); +} diff --git a/media/libjpeg/jversion.h b/media/libjpeg/jversion.h new file mode 100644 index 000000000..6ce663d82 --- /dev/null +++ b/media/libjpeg/jversion.h @@ -0,0 +1,49 @@ +/* + * jversion.h + * + * This file was part of the Independent JPEG Group's software: + * Copyright (C) 1991-2012, Thomas G. Lane, Guido Vollbeding. + * libjpeg-turbo Modifications: + * Copyright (C) 2010, 2012-2016, D. R. Commander. + * For conditions of distribution and use, see the accompanying README.ijg + * file. + * + * This file contains software version identification. + */ + + +#if JPEG_LIB_VERSION >= 80 + +#define JVERSION "8d 15-Jan-2012" + +#elif JPEG_LIB_VERSION >= 70 + +#define JVERSION "7 27-Jun-2009" + +#else + +#define JVERSION "6b 27-Mar-1998" + +#endif + +/* + * NOTE: It is our convention to place the authors in the following order: + * - libjpeg-turbo authors (2009-) in descending order of the date of their + * most recent contribution to the project, then in ascending order of the + * date of their first contribution to the project + * - Upstream authors in descending order of the date of the first inclusion of + * their code + */ + +#define JCOPYRIGHT "Copyright (C) 2009-2016 D. R. Commander\n" \ + "Copyright (C) 2011-2016 Siarhei Siamashka\n" \ + "Copyright (C) 2015-2016 Matthieu Darbois\n" \ + "Copyright (C) 2015 Google, Inc.\n" \ + "Copyright (C) 2013-2014 MIPS Technologies, Inc.\n" \ + "Copyright (C) 2013 Linaro Limited\n" \ + "Copyright (C) 2009-2011 Nokia Corporation and/or its subsidiary(-ies)\n" \ + "Copyright (C) 2009 Pierre Ossman for Cendio AB\n" \ + "Copyright (C) 1999-2006 MIYASAKA Masaru\n" \ + "Copyright (C) 1991-2016 Thomas G. Lane, Guido Vollbeding" \ + +#define JCOPYRIGHT_SHORT "Copyright (C) 1991-2016 The libjpeg-turbo Project and many others" diff --git a/media/libjpeg/moz.build b/media/libjpeg/moz.build new file mode 100644 index 000000000..fbfe2ca62 --- /dev/null +++ b/media/libjpeg/moz.build @@ -0,0 +1,158 @@ +# -*- Mode: python; indent-tabs-mode: nil; tab-width: 40 -*- +# vim: set filetype=python: +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. + +EXPORTS += [ + 'jconfig.h', + 'jerror.h', + 'jinclude.h', + 'jmorecfg.h', + 'jpegint.h', + 'jpeglib.h', +] + +SOURCES += [ + 'jcomapi.c', + 'jdapimin.c', + 'jdapistd.c', + 'jdatadst.c', + 'jdatasrc.c', + 'jdcoefct.c', + 'jdcolor.c', + 'jddctmgr.c', + 'jdhuff.c', + 'jdinput.c', + 'jdmainct.c', + 'jdmarker.c', + 'jdmaster.c', + 'jdmerge.c', + 'jdphuff.c', + 'jdpostct.c', + 'jdsample.c', + 'jdtrans.c', + 'jerror.c', + 'jfdctflt.c', + 'jfdctfst.c', + 'jfdctint.c', + 'jidctflt.c', + 'jidctfst.c', + 'jidctint.c', + 'jidctred.c', + 'jmemmgr.c', + 'jmemnobs.c', + 'jquant1.c', + 'jquant2.c', + 'jutils.c', +] + +# These files enable support for writing JPEGs +SOURCES += [ + 'jcapimin.c', + 'jcapistd.c', + 'jccoefct.c', + 'jccolor.c', + 'jcdctmgr.c', + 'jchuff.c', + 'jcinit.c', + 'jcmainct.c', + 'jcmarker.c', + 'jcmaster.c', + 'jcparam.c', + 'jcphuff.c', + 'jcprepct.c', + 'jcsample.c', + 'jctrans.c', +] + +if CONFIG['LIBJPEG_TURBO_USE_YASM']: + USE_YASM = True + +if CONFIG['LIBJPEG_TURBO_ASFLAGS']: + if CONFIG['CPU_ARCH'] == 'arm': + SOURCES += [ + 'simd/jsimd_arm.c', + 'simd/jsimd_arm_neon.S', + ] + elif CONFIG['CPU_ARCH'] == 'aarch64': + SOURCES += [ + 'simd/jsimd_arm64.c', + 'simd/jsimd_arm64_neon.S', + ] + elif CONFIG['CPU_ARCH'] == 'mips': + SOURCES += [ + 'simd/jsimd_mips.c', + 'simd/jsimd_mips_dspr2.S', + ] + elif CONFIG['CPU_ARCH'] == 'x86_64': + SOURCES += [ + 'simd/jccolor-sse2-64.asm', + 'simd/jcgray-sse2-64.asm', + 'simd/jchuff-sse2-64.asm', + 'simd/jcsample-sse2-64.asm', + 'simd/jdcolor-sse2-64.asm', + 'simd/jdmerge-sse2-64.asm', + 'simd/jdsample-sse2-64.asm', + 'simd/jfdctflt-sse-64.asm', + 'simd/jfdctfst-sse2-64.asm', + 'simd/jfdctint-sse2-64.asm', + 'simd/jidctflt-sse2-64.asm', + 'simd/jidctfst-sse2-64.asm', + 'simd/jidctint-sse2-64.asm', + 'simd/jidctred-sse2-64.asm', + 'simd/jquantf-sse2-64.asm', + 'simd/jquanti-sse2-64.asm', + 'simd/jsimd_x86_64.c', + ] + elif CONFIG['CPU_ARCH'] == 'x86': + SOURCES += [ + 'simd/jccolor-mmx.asm', + 'simd/jccolor-sse2.asm', + 'simd/jcgray-mmx.asm', + 'simd/jcgray-sse2.asm', + 'simd/jchuff-sse2.asm', + 'simd/jcsample-mmx.asm', + 'simd/jcsample-sse2.asm', + 'simd/jdcolor-mmx.asm', + 'simd/jdcolor-sse2.asm', + 'simd/jdmerge-mmx.asm', + 'simd/jdmerge-sse2.asm', + 'simd/jdsample-mmx.asm', + 'simd/jdsample-sse2.asm', + 'simd/jfdctflt-3dn.asm', + 'simd/jfdctflt-sse.asm', + 'simd/jfdctfst-mmx.asm', + 'simd/jfdctfst-sse2.asm', + 'simd/jfdctint-mmx.asm', + 'simd/jfdctint-sse2.asm', + 'simd/jidctflt-3dn.asm', + 'simd/jidctflt-sse.asm', + 'simd/jidctflt-sse2.asm', + 'simd/jidctfst-mmx.asm', + 'simd/jidctfst-sse2.asm', + 'simd/jidctint-mmx.asm', + 'simd/jidctint-sse2.asm', + 'simd/jidctred-mmx.asm', + 'simd/jidctred-sse2.asm', + 'simd/jquant-3dn.asm', + 'simd/jquant-mmx.asm', + 'simd/jquant-sse.asm', + 'simd/jquantf-sse2.asm', + 'simd/jquanti-sse2.asm', + 'simd/jsimd_i386.c', + 'simd/jsimdcpu.asm', + ] +else: # No SIMD support? + SOURCES += [ + 'jsimd_none.c', + ] + +ASFLAGS += CONFIG['LIBJPEG_TURBO_ASFLAGS'] +ASFLAGS += ['-I%s/media/libjpeg/simd/' % TOPSRCDIR] + +# We allow warnings for third-party code that can be updated from upstream. +ALLOW_COMPILER_WARNINGS = True + +FINAL_LIBRARY = 'gkmedias' + diff --git a/media/libjpeg/mozilla.diff b/media/libjpeg/mozilla.diff new file mode 100644 index 000000000..24b235b40 --- /dev/null +++ b/media/libjpeg/mozilla.diff @@ -0,0 +1,112 @@ +diff --git jmemmgr.c jmemmgr.c +--- jmemmgr.c ++++ jmemmgr.c +@@ -28,16 +28,17 @@ + */ + + #define JPEG_INTERNALS + #define AM_MEMORY_MANAGER /* we define jvirt_Xarray_control structs */ + #include "jinclude.h" + #include "jpeglib.h" + #include "jmemsys.h" /* import the system-dependent declarations */ + #include <stdint.h> ++#include <limits.h> /* some NDKs define SIZE_MAX in limits.h */ + + #ifndef NO_GETENV + #ifndef HAVE_STDLIB_H /* <stdlib.h> should declare getenv() */ + extern char *getenv (const char *name); + #endif + #endif + + +diff --git jmorecfg.h jmorecfg.h +--- jmorecfg.h ++++ jmorecfg.h +@@ -9,16 +9,17 @@ + * For conditions of distribution and use, see the accompanying README.ijg + * file. + * + * This file contains additional configuration options that customize the + * JPEG software for special applications or support machine-dependent + * optimizations. Most users will not need to touch this file. + */ + ++#include <stdint.h> + + /* + * Maximum number of components (color channels) allowed in JPEG image. + * To meet the letter of the JPEG spec, set this to 255. However, darn + * few applications need more than 4 channels (maybe 5 for CMYK + alpha + * mask). We recommend 10 as a reasonable compromise; use 4 if you are + * really short on memory. (Each allowed component costs a hundred or so + * bytes of storage, whether actually used in an image or not.) +@@ -118,39 +119,25 @@ typedef char JOCTET; + * They must be at least as wide as specified; but making them too big + * won't cost a huge amount of memory, so we don't provide special + * extraction code like we did for JSAMPLE. (In other words, these + * typedefs live at a different point on the speed/space tradeoff curve.) + */ + + /* UINT8 must hold at least the values 0..255. */ + +-#ifdef HAVE_UNSIGNED_CHAR +-typedef unsigned char UINT8; +-#else /* not HAVE_UNSIGNED_CHAR */ +-#ifdef __CHAR_UNSIGNED__ +-typedef char UINT8; +-#else /* not __CHAR_UNSIGNED__ */ +-typedef short UINT8; +-#endif /* __CHAR_UNSIGNED__ */ +-#endif /* HAVE_UNSIGNED_CHAR */ ++typedef uint8_t UINT8; + + /* UINT16 must hold at least the values 0..65535. */ + +-#ifdef HAVE_UNSIGNED_SHORT +-typedef unsigned short UINT16; +-#else /* not HAVE_UNSIGNED_SHORT */ +-typedef unsigned int UINT16; +-#endif /* HAVE_UNSIGNED_SHORT */ ++typedef uint16_t UINT16; + + /* INT16 must hold at least the values -32768..32767. */ + +-#ifndef XMD_H /* X11/xmd.h correctly defines INT16 */ +-typedef short INT16; +-#endif ++typedef int16_t INT16; + + /* INT32 must hold at least signed 32-bit values. + * + * NOTE: The INT32 typedef dates back to libjpeg v5 (1994.) Integers were + * sometimes 16-bit back then (MS-DOS), which is why INT32 is typedef'd to + * long. It also wasn't common (or at least as common) in 1994 for INT32 to be + * defined by platform headers. Since then, however, INT32 is defined in + * several other common places: +@@ -167,25 +154,17 @@ typedef short INT16; + * This is a recipe for conflict, since "long" and "int" aren't always + * compatible types. Since the definition of INT32 has technically been part + * of the libjpeg API for more than 20 years, we can't remove it, but we do not + * use it internally any longer. We instead define a separate type (JLONG) + * for internal use, which ensures that internal behavior will always be the + * same regardless of any external headers that may be included. + */ + +-#ifndef XMD_H /* X11/xmd.h correctly defines INT32 */ +-#ifndef _BASETSD_H_ /* Microsoft defines it in basetsd.h */ +-#ifndef _BASETSD_H /* MinGW is slightly different */ +-#ifndef QGLOBAL_H /* Qt defines it in qglobal.h */ +-typedef long INT32; +-#endif +-#endif +-#endif +-#endif ++typedef int32_t INT32; + + /* Datatype used for image dimensions. The JPEG standard only supports + * images up to 64K*64K due to 16-bit fields in SOF markers. Therefore + * "unsigned int" is sufficient on all machines. However, if you need to + * handle larger images and you don't mind deviating from the spec, you + * can change this datatype. (Note that changing this datatype will + * potentially require modifying the SIMD code. The x86-64 SIMD extensions, + * in particular, assume a 32-bit JDIMENSION.) diff --git a/media/libjpeg/simd/jccolext-altivec.c b/media/libjpeg/simd/jccolext-altivec.c new file mode 100644 index 000000000..849825eb0 --- /dev/null +++ b/media/libjpeg/simd/jccolext-altivec.c @@ -0,0 +1,267 @@ +/* + * AltiVec optimizations for libjpeg-turbo + * + * Copyright (C) 2014-2015, D. R. Commander. All Rights Reserved. + * Copyright (C) 2014, Jay Foad. All Rights Reserved. + * + * This software is provided 'as-is', without any express or implied + * warranty. In no event will the authors be held liable for any damages + * arising from the use of this software. + * + * Permission is granted to anyone to use this software for any purpose, + * including commercial applications, and to alter it and redistribute it + * freely, subject to the following restrictions: + * + * 1. The origin of this software must not be misrepresented; you must not + * claim that you wrote the original software. If you use this software + * in a product, an acknowledgment in the product documentation would be + * appreciated but is not required. + * 2. Altered source versions must be plainly marked as such, and must not be + * misrepresented as being the original software. + * 3. This notice may not be removed or altered from any source distribution. + */ + +/* This file is included by jccolor-altivec.c */ + + +void jsimd_rgb_ycc_convert_altivec (JDIMENSION img_width, JSAMPARRAY input_buf, + JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows) +{ + JSAMPROW inptr, outptr0, outptr1, outptr2; + int pitch = img_width * RGB_PIXELSIZE, num_cols; +#if __BIG_ENDIAN__ + int offset; +#endif + unsigned char __attribute__((aligned(16))) tmpbuf[RGB_PIXELSIZE * 16]; + + __vector unsigned char rgb0, rgb1 = {0}, rgb2 = {0}, + rgbg0, rgbg1, rgbg2, rgbg3, y, cb, cr; +#if __BIG_ENDIAN__ || RGB_PIXELSIZE == 4 + __vector unsigned char rgb3 = {0}; +#endif +#if __BIG_ENDIAN__ && RGB_PIXELSIZE == 4 + __vector unsigned char rgb4 = {0}; +#endif + __vector short rg0, rg1, rg2, rg3, bg0, bg1, bg2, bg3; + __vector unsigned short yl, yh, crl, crh, cbl, cbh; + __vector int y0, y1, y2, y3, cr0, cr1, cr2, cr3, cb0, cb1, cb2, cb3; + + /* Constants */ + __vector short pw_f0299_f0337 = { __4X2(F_0_299, F_0_337) }, + pw_f0114_f0250 = { __4X2(F_0_114, F_0_250) }, + pw_mf016_mf033 = { __4X2(-F_0_168, -F_0_331) }, + pw_mf008_mf041 = { __4X2(-F_0_081, -F_0_418) }; + __vector unsigned short pw_f050_f000 = { __4X2(F_0_500, 0) }; + __vector int pd_onehalf = { __4X(ONE_HALF) }, + pd_onehalfm1_cj = { __4X(ONE_HALF - 1 + (CENTERJSAMPLE << SCALEBITS)) }; + __vector unsigned char pb_zero = { __16X(0) }, +#if __BIG_ENDIAN__ + shift_pack_index = {0,1,4,5,8,9,12,13,16,17,20,21,24,25,28,29}; +#else + shift_pack_index = {2,3,6,7,10,11,14,15,18,19,22,23,26,27,30,31}; +#endif + + while (--num_rows >= 0) { + inptr = *input_buf++; + outptr0 = output_buf[0][output_row]; + outptr1 = output_buf[1][output_row]; + outptr2 = output_buf[2][output_row]; + output_row++; + + for (num_cols = pitch; num_cols > 0; + num_cols -= RGB_PIXELSIZE * 16, inptr += RGB_PIXELSIZE * 16, + outptr0 += 16, outptr1 += 16, outptr2 += 16) { + +#if __BIG_ENDIAN__ + /* Load 16 pixels == 48 or 64 bytes */ + offset = (size_t)inptr & 15; + if (offset) { + __vector unsigned char unaligned_shift_index; + int bytes = num_cols + offset; + + if (bytes < (RGB_PIXELSIZE + 1) * 16 && (bytes & 15)) { + /* Slow path to prevent buffer overread. Since there is no way to + * read a partial AltiVec register, overread would occur on the last + * chunk of the last image row if the right edge is not on a 16-byte + * boundary. It could also occur on other rows if the bytes per row + * is low enough. Since we can't determine whether we're on the last + * image row, we have to assume every row is the last. + */ + memcpy(tmpbuf, inptr, min(num_cols, RGB_PIXELSIZE * 16)); + rgb0 = vec_ld(0, tmpbuf); + rgb1 = vec_ld(16, tmpbuf); + rgb2 = vec_ld(32, tmpbuf); +#if RGB_PIXELSIZE == 4 + rgb3 = vec_ld(48, tmpbuf); +#endif + } else { + /* Fast path */ + rgb0 = vec_ld(0, inptr); + if (bytes > 16) + rgb1 = vec_ld(16, inptr); + if (bytes > 32) + rgb2 = vec_ld(32, inptr); + if (bytes > 48) + rgb3 = vec_ld(48, inptr); +#if RGB_PIXELSIZE == 4 + if (bytes > 64) + rgb4 = vec_ld(64, inptr); +#endif + unaligned_shift_index = vec_lvsl(0, inptr); + rgb0 = vec_perm(rgb0, rgb1, unaligned_shift_index); + rgb1 = vec_perm(rgb1, rgb2, unaligned_shift_index); + rgb2 = vec_perm(rgb2, rgb3, unaligned_shift_index); +#if RGB_PIXELSIZE == 4 + rgb3 = vec_perm(rgb3, rgb4, unaligned_shift_index); +#endif + } + } else { +#endif /* __BIG_ENDIAN__ */ + if (num_cols < RGB_PIXELSIZE * 16 && (num_cols & 15)) { + /* Slow path */ + memcpy(tmpbuf, inptr, min(num_cols, RGB_PIXELSIZE * 16)); + rgb0 = VEC_LD(0, tmpbuf); + rgb1 = VEC_LD(16, tmpbuf); + rgb2 = VEC_LD(32, tmpbuf); +#if RGB_PIXELSIZE == 4 + rgb3 = VEC_LD(48, tmpbuf); +#endif + } else { + /* Fast path */ + rgb0 = VEC_LD(0, inptr); + if (num_cols > 16) + rgb1 = VEC_LD(16, inptr); + if (num_cols > 32) + rgb2 = VEC_LD(32, inptr); +#if RGB_PIXELSIZE == 4 + if (num_cols > 48) + rgb3 = VEC_LD(48, inptr); +#endif + } +#if __BIG_ENDIAN__ + } +#endif + +#if RGB_PIXELSIZE == 3 + /* rgb0 = R0 G0 B0 R1 G1 B1 R2 G2 B2 R3 G3 B3 R4 G4 B4 R5 + * rgb1 = G5 B5 R6 G6 B6 R7 G7 B7 R8 G8 B8 R9 G9 B9 Ra Ga + * rgb2 = Ba Rb Gb Bb Rc Gc Bc Rd Gd Bd Re Ge Be Rf Gf Bf + * + * rgbg0 = R0 G0 R1 G1 R2 G2 R3 G3 B0 G0 B1 G1 B2 G2 B3 G3 + * rgbg1 = R4 G4 R5 G5 R6 G6 R7 G7 B4 G4 B5 G5 B6 G6 B7 G7 + * rgbg2 = R8 G8 R9 G9 Ra Ga Rb Gb B8 G8 B9 G9 Ba Ga Bb Gb + * rgbg3 = Rc Gc Rd Gd Re Ge Rf Gf Bc Gc Bd Gd Be Ge Bf Gf + */ + rgbg0 = vec_perm(rgb0, rgb0, (__vector unsigned char)RGBG_INDEX0); + rgbg1 = vec_perm(rgb0, rgb1, (__vector unsigned char)RGBG_INDEX1); + rgbg2 = vec_perm(rgb1, rgb2, (__vector unsigned char)RGBG_INDEX2); + rgbg3 = vec_perm(rgb2, rgb2, (__vector unsigned char)RGBG_INDEX3); +#else + /* rgb0 = R0 G0 B0 X0 R1 G1 B1 X1 R2 G2 B2 X2 R3 G3 B3 X3 + * rgb1 = R4 G4 B4 X4 R5 G5 B5 X5 R6 G6 B6 X6 R7 G7 B7 X7 + * rgb2 = R8 G8 B8 X8 R9 G9 B9 X9 Ra Ga Ba Xa Rb Gb Bb Xb + * rgb3 = Rc Gc Bc Xc Rd Gd Bd Xd Re Ge Be Xe Rf Gf Bf Xf + * + * rgbg0 = R0 G0 R1 G1 R2 G2 R3 G3 B0 G0 B1 G1 B2 G2 B3 G3 + * rgbg1 = R4 G4 R5 G5 R6 G6 R7 G7 B4 G4 B5 G5 B6 G6 B7 G7 + * rgbg2 = R8 G8 R9 G9 Ra Ga Rb Gb B8 G8 B9 G9 Ba Ga Bb Gb + * rgbg3 = Rc Gc Rd Gd Re Ge Rf Gf Bc Gc Bd Gd Be Ge Bf Gf + */ + rgbg0 = vec_perm(rgb0, rgb0, (__vector unsigned char)RGBG_INDEX); + rgbg1 = vec_perm(rgb1, rgb1, (__vector unsigned char)RGBG_INDEX); + rgbg2 = vec_perm(rgb2, rgb2, (__vector unsigned char)RGBG_INDEX); + rgbg3 = vec_perm(rgb3, rgb3, (__vector unsigned char)RGBG_INDEX); +#endif + + /* rg0 = R0 G0 R1 G1 R2 G2 R3 G3 + * bg0 = B0 G0 B1 G1 B2 G2 B3 G3 + * ... + * + * NOTE: We have to use vec_merge*() here because vec_unpack*() doesn't + * support unsigned vectors. + */ + rg0 = (__vector signed short)VEC_UNPACKHU(rgbg0); + bg0 = (__vector signed short)VEC_UNPACKLU(rgbg0); + rg1 = (__vector signed short)VEC_UNPACKHU(rgbg1); + bg1 = (__vector signed short)VEC_UNPACKLU(rgbg1); + rg2 = (__vector signed short)VEC_UNPACKHU(rgbg2); + bg2 = (__vector signed short)VEC_UNPACKLU(rgbg2); + rg3 = (__vector signed short)VEC_UNPACKHU(rgbg3); + bg3 = (__vector signed short)VEC_UNPACKLU(rgbg3); + + /* (Original) + * Y = 0.29900 * R + 0.58700 * G + 0.11400 * B + * Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE + * Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE + * + * (This implementation) + * Y = 0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G + * Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE + * Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE + */ + + /* Calculate Y values */ + + y0 = vec_msums(rg0, pw_f0299_f0337, pd_onehalf); + y1 = vec_msums(rg1, pw_f0299_f0337, pd_onehalf); + y2 = vec_msums(rg2, pw_f0299_f0337, pd_onehalf); + y3 = vec_msums(rg3, pw_f0299_f0337, pd_onehalf); + y0 = vec_msums(bg0, pw_f0114_f0250, y0); + y1 = vec_msums(bg1, pw_f0114_f0250, y1); + y2 = vec_msums(bg2, pw_f0114_f0250, y2); + y3 = vec_msums(bg3, pw_f0114_f0250, y3); + /* Clever way to avoid 4 shifts + 2 packs. This packs the high word from + * each dword into a new 16-bit vector, which is the equivalent of + * descaling the 32-bit results (right-shifting by 16 bits) and then + * packing them. + */ + yl = vec_perm((__vector unsigned short)y0, (__vector unsigned short)y1, + shift_pack_index); + yh = vec_perm((__vector unsigned short)y2, (__vector unsigned short)y3, + shift_pack_index); + y = vec_pack(yl, yh); + vec_st(y, 0, outptr0); + + /* Calculate Cb values */ + cb0 = vec_msums(rg0, pw_mf016_mf033, pd_onehalfm1_cj); + cb1 = vec_msums(rg1, pw_mf016_mf033, pd_onehalfm1_cj); + cb2 = vec_msums(rg2, pw_mf016_mf033, pd_onehalfm1_cj); + cb3 = vec_msums(rg3, pw_mf016_mf033, pd_onehalfm1_cj); + cb0 = (__vector int)vec_msum((__vector unsigned short)bg0, pw_f050_f000, + (__vector unsigned int)cb0); + cb1 = (__vector int)vec_msum((__vector unsigned short)bg1, pw_f050_f000, + (__vector unsigned int)cb1); + cb2 = (__vector int)vec_msum((__vector unsigned short)bg2, pw_f050_f000, + (__vector unsigned int)cb2); + cb3 = (__vector int)vec_msum((__vector unsigned short)bg3, pw_f050_f000, + (__vector unsigned int)cb3); + cbl = vec_perm((__vector unsigned short)cb0, + (__vector unsigned short)cb1, shift_pack_index); + cbh = vec_perm((__vector unsigned short)cb2, + (__vector unsigned short)cb3, shift_pack_index); + cb = vec_pack(cbl, cbh); + vec_st(cb, 0, outptr1); + + /* Calculate Cr values */ + cr0 = vec_msums(bg0, pw_mf008_mf041, pd_onehalfm1_cj); + cr1 = vec_msums(bg1, pw_mf008_mf041, pd_onehalfm1_cj); + cr2 = vec_msums(bg2, pw_mf008_mf041, pd_onehalfm1_cj); + cr3 = vec_msums(bg3, pw_mf008_mf041, pd_onehalfm1_cj); + cr0 = (__vector int)vec_msum((__vector unsigned short)rg0, pw_f050_f000, + (__vector unsigned int)cr0); + cr1 = (__vector int)vec_msum((__vector unsigned short)rg1, pw_f050_f000, + (__vector unsigned int)cr1); + cr2 = (__vector int)vec_msum((__vector unsigned short)rg2, pw_f050_f000, + (__vector unsigned int)cr2); + cr3 = (__vector int)vec_msum((__vector unsigned short)rg3, pw_f050_f000, + (__vector unsigned int)cr3); + crl = vec_perm((__vector unsigned short)cr0, + (__vector unsigned short)cr1, shift_pack_index); + crh = vec_perm((__vector unsigned short)cr2, + (__vector unsigned short)cr3, shift_pack_index); + cr = vec_pack(crl, crh); + vec_st(cr, 0, outptr2); + } + } +} diff --git a/media/libjpeg/simd/jccolext-mmx.asm b/media/libjpeg/simd/jccolext-mmx.asm new file mode 100644 index 000000000..96a0372b1 --- /dev/null +++ b/media/libjpeg/simd/jccolext-mmx.asm @@ -0,0 +1,476 @@ +; +; jccolext.asm - colorspace conversion (MMX) +; +; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB +; +; Based on the x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler). +; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 +; +; [TAB8] + +%include "jcolsamp.inc" + +; -------------------------------------------------------------------------- +; +; Convert some rows of samples to the output colorspace. +; +; GLOBAL(void) +; jsimd_rgb_ycc_convert_mmx (JDIMENSION img_width, +; JSAMPARRAY input_buf, JSAMPIMAGE output_buf, +; JDIMENSION output_row, int num_rows); +; + +%define img_width(b) (b)+8 ; JDIMENSION img_width +%define input_buf(b) (b)+12 ; JSAMPARRAY input_buf +%define output_buf(b) (b)+16 ; JSAMPIMAGE output_buf +%define output_row(b) (b)+20 ; JDIMENSION output_row +%define num_rows(b) (b)+24 ; int num_rows + +%define original_ebp ebp+0 +%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_MMWORD ; mmword wk[WK_NUM] +%define WK_NUM 8 +%define gotptr wk(0)-SIZEOF_POINTER ; void * gotptr + + align 16 + global EXTN(jsimd_rgb_ycc_convert_mmx) + +EXTN(jsimd_rgb_ycc_convert_mmx): + push ebp + mov eax,esp ; eax = original ebp + sub esp, byte 4 + and esp, byte (-SIZEOF_MMWORD) ; align to 64 bits + mov [esp],eax + mov ebp,esp ; ebp = aligned ebp + lea esp, [wk(0)] + pushpic eax ; make a room for GOT address + push ebx +; push ecx ; need not be preserved +; push edx ; need not be preserved + push esi + push edi + + get_GOT ebx ; get GOT address + movpic POINTER [gotptr], ebx ; save GOT address + + mov ecx, JDIMENSION [img_width(eax)] ; num_cols + test ecx,ecx + jz near .return + + push ecx + + mov esi, JSAMPIMAGE [output_buf(eax)] + mov ecx, JDIMENSION [output_row(eax)] + mov edi, JSAMPARRAY [esi+0*SIZEOF_JSAMPARRAY] + mov ebx, JSAMPARRAY [esi+1*SIZEOF_JSAMPARRAY] + mov edx, JSAMPARRAY [esi+2*SIZEOF_JSAMPARRAY] + lea edi, [edi+ecx*SIZEOF_JSAMPROW] + lea ebx, [ebx+ecx*SIZEOF_JSAMPROW] + lea edx, [edx+ecx*SIZEOF_JSAMPROW] + + pop ecx + + mov esi, JSAMPARRAY [input_buf(eax)] + mov eax, INT [num_rows(eax)] + test eax,eax + jle near .return + alignx 16,7 +.rowloop: + pushpic eax + push edx + push ebx + push edi + push esi + push ecx ; col + + mov esi, JSAMPROW [esi] ; inptr + mov edi, JSAMPROW [edi] ; outptr0 + mov ebx, JSAMPROW [ebx] ; outptr1 + mov edx, JSAMPROW [edx] ; outptr2 + movpic eax, POINTER [gotptr] ; load GOT address (eax) + + cmp ecx, byte SIZEOF_MMWORD + jae short .columnloop + alignx 16,7 + +%if RGB_PIXELSIZE == 3 ; --------------- + +.column_ld1: + push eax + push edx + lea ecx,[ecx+ecx*2] ; imul ecx,RGB_PIXELSIZE + test cl, SIZEOF_BYTE + jz short .column_ld2 + sub ecx, byte SIZEOF_BYTE + xor eax,eax + mov al, BYTE [esi+ecx] +.column_ld2: + test cl, SIZEOF_WORD + jz short .column_ld4 + sub ecx, byte SIZEOF_WORD + xor edx,edx + mov dx, WORD [esi+ecx] + shl eax, WORD_BIT + or eax,edx +.column_ld4: + movd mmA,eax + pop edx + pop eax + test cl, SIZEOF_DWORD + jz short .column_ld8 + sub ecx, byte SIZEOF_DWORD + movd mmG, DWORD [esi+ecx] + psllq mmA, DWORD_BIT + por mmA,mmG +.column_ld8: + test cl, SIZEOF_MMWORD + jz short .column_ld16 + movq mmG,mmA + movq mmA, MMWORD [esi+0*SIZEOF_MMWORD] + mov ecx, SIZEOF_MMWORD + jmp short .rgb_ycc_cnv +.column_ld16: + test cl, 2*SIZEOF_MMWORD + mov ecx, SIZEOF_MMWORD + jz short .rgb_ycc_cnv + movq mmF,mmA + movq mmA, MMWORD [esi+0*SIZEOF_MMWORD] + movq mmG, MMWORD [esi+1*SIZEOF_MMWORD] + jmp short .rgb_ycc_cnv + alignx 16,7 + +.columnloop: + movq mmA, MMWORD [esi+0*SIZEOF_MMWORD] + movq mmG, MMWORD [esi+1*SIZEOF_MMWORD] + movq mmF, MMWORD [esi+2*SIZEOF_MMWORD] + +.rgb_ycc_cnv: + ; mmA=(00 10 20 01 11 21 02 12) + ; mmG=(22 03 13 23 04 14 24 05) + ; mmF=(15 25 06 16 26 07 17 27) + + movq mmD,mmA + psllq mmA,4*BYTE_BIT ; mmA=(-- -- -- -- 00 10 20 01) + psrlq mmD,4*BYTE_BIT ; mmD=(11 21 02 12 -- -- -- --) + + punpckhbw mmA,mmG ; mmA=(00 04 10 14 20 24 01 05) + psllq mmG,4*BYTE_BIT ; mmG=(-- -- -- -- 22 03 13 23) + + punpcklbw mmD,mmF ; mmD=(11 15 21 25 02 06 12 16) + punpckhbw mmG,mmF ; mmG=(22 26 03 07 13 17 23 27) + + movq mmE,mmA + psllq mmA,4*BYTE_BIT ; mmA=(-- -- -- -- 00 04 10 14) + psrlq mmE,4*BYTE_BIT ; mmE=(20 24 01 05 -- -- -- --) + + punpckhbw mmA,mmD ; mmA=(00 02 04 06 10 12 14 16) + psllq mmD,4*BYTE_BIT ; mmD=(-- -- -- -- 11 15 21 25) + + punpcklbw mmE,mmG ; mmE=(20 22 24 26 01 03 05 07) + punpckhbw mmD,mmG ; mmD=(11 13 15 17 21 23 25 27) + + pxor mmH,mmH + + movq mmC,mmA + punpcklbw mmA,mmH ; mmA=(00 02 04 06) + punpckhbw mmC,mmH ; mmC=(10 12 14 16) + + movq mmB,mmE + punpcklbw mmE,mmH ; mmE=(20 22 24 26) + punpckhbw mmB,mmH ; mmB=(01 03 05 07) + + movq mmF,mmD + punpcklbw mmD,mmH ; mmD=(11 13 15 17) + punpckhbw mmF,mmH ; mmF=(21 23 25 27) + +%else ; RGB_PIXELSIZE == 4 ; ----------- + +.column_ld1: + test cl, SIZEOF_MMWORD/8 + jz short .column_ld2 + sub ecx, byte SIZEOF_MMWORD/8 + movd mmA, DWORD [esi+ecx*RGB_PIXELSIZE] +.column_ld2: + test cl, SIZEOF_MMWORD/4 + jz short .column_ld4 + sub ecx, byte SIZEOF_MMWORD/4 + movq mmF,mmA + movq mmA, MMWORD [esi+ecx*RGB_PIXELSIZE] +.column_ld4: + test cl, SIZEOF_MMWORD/2 + mov ecx, SIZEOF_MMWORD + jz short .rgb_ycc_cnv + movq mmD,mmA + movq mmC,mmF + movq mmA, MMWORD [esi+0*SIZEOF_MMWORD] + movq mmF, MMWORD [esi+1*SIZEOF_MMWORD] + jmp short .rgb_ycc_cnv + alignx 16,7 + +.columnloop: + movq mmA, MMWORD [esi+0*SIZEOF_MMWORD] + movq mmF, MMWORD [esi+1*SIZEOF_MMWORD] + movq mmD, MMWORD [esi+2*SIZEOF_MMWORD] + movq mmC, MMWORD [esi+3*SIZEOF_MMWORD] + +.rgb_ycc_cnv: + ; mmA=(00 10 20 30 01 11 21 31) + ; mmF=(02 12 22 32 03 13 23 33) + ; mmD=(04 14 24 34 05 15 25 35) + ; mmC=(06 16 26 36 07 17 27 37) + + movq mmB,mmA + punpcklbw mmA,mmF ; mmA=(00 02 10 12 20 22 30 32) + punpckhbw mmB,mmF ; mmB=(01 03 11 13 21 23 31 33) + + movq mmG,mmD + punpcklbw mmD,mmC ; mmD=(04 06 14 16 24 26 34 36) + punpckhbw mmG,mmC ; mmG=(05 07 15 17 25 27 35 37) + + movq mmE,mmA + punpcklwd mmA,mmD ; mmA=(00 02 04 06 10 12 14 16) + punpckhwd mmE,mmD ; mmE=(20 22 24 26 30 32 34 36) + + movq mmH,mmB + punpcklwd mmB,mmG ; mmB=(01 03 05 07 11 13 15 17) + punpckhwd mmH,mmG ; mmH=(21 23 25 27 31 33 35 37) + + pxor mmF,mmF + + movq mmC,mmA + punpcklbw mmA,mmF ; mmA=(00 02 04 06) + punpckhbw mmC,mmF ; mmC=(10 12 14 16) + + movq mmD,mmB + punpcklbw mmB,mmF ; mmB=(01 03 05 07) + punpckhbw mmD,mmF ; mmD=(11 13 15 17) + + movq mmG,mmE + punpcklbw mmE,mmF ; mmE=(20 22 24 26) + punpckhbw mmG,mmF ; mmG=(30 32 34 36) + + punpcklbw mmF,mmH + punpckhbw mmH,mmH + psrlw mmF,BYTE_BIT ; mmF=(21 23 25 27) + psrlw mmH,BYTE_BIT ; mmH=(31 33 35 37) + +%endif ; RGB_PIXELSIZE ; --------------- + + ; mm0=(R0 R2 R4 R6)=RE, mm2=(G0 G2 G4 G6)=GE, mm4=(B0 B2 B4 B6)=BE + ; mm1=(R1 R3 R5 R7)=RO, mm3=(G1 G3 G5 G7)=GO, mm5=(B1 B3 B5 B7)=BO + + ; (Original) + ; Y = 0.29900 * R + 0.58700 * G + 0.11400 * B + ; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE + ; Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE + ; + ; (This implementation) + ; Y = 0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G + ; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE + ; Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE + + movq MMWORD [wk(0)], mm0 ; wk(0)=RE + movq MMWORD [wk(1)], mm1 ; wk(1)=RO + movq MMWORD [wk(2)], mm4 ; wk(2)=BE + movq MMWORD [wk(3)], mm5 ; wk(3)=BO + + movq mm6,mm1 + punpcklwd mm1,mm3 + punpckhwd mm6,mm3 + movq mm7,mm1 + movq mm4,mm6 + pmaddwd mm1,[GOTOFF(eax,PW_F0299_F0337)] ; mm1=ROL*FIX(0.299)+GOL*FIX(0.337) + pmaddwd mm6,[GOTOFF(eax,PW_F0299_F0337)] ; mm6=ROH*FIX(0.299)+GOH*FIX(0.337) + pmaddwd mm7,[GOTOFF(eax,PW_MF016_MF033)] ; mm7=ROL*-FIX(0.168)+GOL*-FIX(0.331) + pmaddwd mm4,[GOTOFF(eax,PW_MF016_MF033)] ; mm4=ROH*-FIX(0.168)+GOH*-FIX(0.331) + + movq MMWORD [wk(4)], mm1 ; wk(4)=ROL*FIX(0.299)+GOL*FIX(0.337) + movq MMWORD [wk(5)], mm6 ; wk(5)=ROH*FIX(0.299)+GOH*FIX(0.337) + + pxor mm1,mm1 + pxor mm6,mm6 + punpcklwd mm1,mm5 ; mm1=BOL + punpckhwd mm6,mm5 ; mm6=BOH + psrld mm1,1 ; mm1=BOL*FIX(0.500) + psrld mm6,1 ; mm6=BOH*FIX(0.500) + + movq mm5,[GOTOFF(eax,PD_ONEHALFM1_CJ)] ; mm5=[PD_ONEHALFM1_CJ] + + paddd mm7,mm1 + paddd mm4,mm6 + paddd mm7,mm5 + paddd mm4,mm5 + psrld mm7,SCALEBITS ; mm7=CbOL + psrld mm4,SCALEBITS ; mm4=CbOH + packssdw mm7,mm4 ; mm7=CbO + + movq mm1, MMWORD [wk(2)] ; mm1=BE + + movq mm6,mm0 + punpcklwd mm0,mm2 + punpckhwd mm6,mm2 + movq mm5,mm0 + movq mm4,mm6 + pmaddwd mm0,[GOTOFF(eax,PW_F0299_F0337)] ; mm0=REL*FIX(0.299)+GEL*FIX(0.337) + pmaddwd mm6,[GOTOFF(eax,PW_F0299_F0337)] ; mm6=REH*FIX(0.299)+GEH*FIX(0.337) + pmaddwd mm5,[GOTOFF(eax,PW_MF016_MF033)] ; mm5=REL*-FIX(0.168)+GEL*-FIX(0.331) + pmaddwd mm4,[GOTOFF(eax,PW_MF016_MF033)] ; mm4=REH*-FIX(0.168)+GEH*-FIX(0.331) + + movq MMWORD [wk(6)], mm0 ; wk(6)=REL*FIX(0.299)+GEL*FIX(0.337) + movq MMWORD [wk(7)], mm6 ; wk(7)=REH*FIX(0.299)+GEH*FIX(0.337) + + pxor mm0,mm0 + pxor mm6,mm6 + punpcklwd mm0,mm1 ; mm0=BEL + punpckhwd mm6,mm1 ; mm6=BEH + psrld mm0,1 ; mm0=BEL*FIX(0.500) + psrld mm6,1 ; mm6=BEH*FIX(0.500) + + movq mm1,[GOTOFF(eax,PD_ONEHALFM1_CJ)] ; mm1=[PD_ONEHALFM1_CJ] + + paddd mm5,mm0 + paddd mm4,mm6 + paddd mm5,mm1 + paddd mm4,mm1 + psrld mm5,SCALEBITS ; mm5=CbEL + psrld mm4,SCALEBITS ; mm4=CbEH + packssdw mm5,mm4 ; mm5=CbE + + psllw mm7,BYTE_BIT + por mm5,mm7 ; mm5=Cb + movq MMWORD [ebx], mm5 ; Save Cb + + movq mm0, MMWORD [wk(3)] ; mm0=BO + movq mm6, MMWORD [wk(2)] ; mm6=BE + movq mm1, MMWORD [wk(1)] ; mm1=RO + + movq mm4,mm0 + punpcklwd mm0,mm3 + punpckhwd mm4,mm3 + movq mm7,mm0 + movq mm5,mm4 + pmaddwd mm0,[GOTOFF(eax,PW_F0114_F0250)] ; mm0=BOL*FIX(0.114)+GOL*FIX(0.250) + pmaddwd mm4,[GOTOFF(eax,PW_F0114_F0250)] ; mm4=BOH*FIX(0.114)+GOH*FIX(0.250) + pmaddwd mm7,[GOTOFF(eax,PW_MF008_MF041)] ; mm7=BOL*-FIX(0.081)+GOL*-FIX(0.418) + pmaddwd mm5,[GOTOFF(eax,PW_MF008_MF041)] ; mm5=BOH*-FIX(0.081)+GOH*-FIX(0.418) + + movq mm3,[GOTOFF(eax,PD_ONEHALF)] ; mm3=[PD_ONEHALF] + + paddd mm0, MMWORD [wk(4)] + paddd mm4, MMWORD [wk(5)] + paddd mm0,mm3 + paddd mm4,mm3 + psrld mm0,SCALEBITS ; mm0=YOL + psrld mm4,SCALEBITS ; mm4=YOH + packssdw mm0,mm4 ; mm0=YO + + pxor mm3,mm3 + pxor mm4,mm4 + punpcklwd mm3,mm1 ; mm3=ROL + punpckhwd mm4,mm1 ; mm4=ROH + psrld mm3,1 ; mm3=ROL*FIX(0.500) + psrld mm4,1 ; mm4=ROH*FIX(0.500) + + movq mm1,[GOTOFF(eax,PD_ONEHALFM1_CJ)] ; mm1=[PD_ONEHALFM1_CJ] + + paddd mm7,mm3 + paddd mm5,mm4 + paddd mm7,mm1 + paddd mm5,mm1 + psrld mm7,SCALEBITS ; mm7=CrOL + psrld mm5,SCALEBITS ; mm5=CrOH + packssdw mm7,mm5 ; mm7=CrO + + movq mm3, MMWORD [wk(0)] ; mm3=RE + + movq mm4,mm6 + punpcklwd mm6,mm2 + punpckhwd mm4,mm2 + movq mm1,mm6 + movq mm5,mm4 + pmaddwd mm6,[GOTOFF(eax,PW_F0114_F0250)] ; mm6=BEL*FIX(0.114)+GEL*FIX(0.250) + pmaddwd mm4,[GOTOFF(eax,PW_F0114_F0250)] ; mm4=BEH*FIX(0.114)+GEH*FIX(0.250) + pmaddwd mm1,[GOTOFF(eax,PW_MF008_MF041)] ; mm1=BEL*-FIX(0.081)+GEL*-FIX(0.418) + pmaddwd mm5,[GOTOFF(eax,PW_MF008_MF041)] ; mm5=BEH*-FIX(0.081)+GEH*-FIX(0.418) + + movq mm2,[GOTOFF(eax,PD_ONEHALF)] ; mm2=[PD_ONEHALF] + + paddd mm6, MMWORD [wk(6)] + paddd mm4, MMWORD [wk(7)] + paddd mm6,mm2 + paddd mm4,mm2 + psrld mm6,SCALEBITS ; mm6=YEL + psrld mm4,SCALEBITS ; mm4=YEH + packssdw mm6,mm4 ; mm6=YE + + psllw mm0,BYTE_BIT + por mm6,mm0 ; mm6=Y + movq MMWORD [edi], mm6 ; Save Y + + pxor mm2,mm2 + pxor mm4,mm4 + punpcklwd mm2,mm3 ; mm2=REL + punpckhwd mm4,mm3 ; mm4=REH + psrld mm2,1 ; mm2=REL*FIX(0.500) + psrld mm4,1 ; mm4=REH*FIX(0.500) + + movq mm0,[GOTOFF(eax,PD_ONEHALFM1_CJ)] ; mm0=[PD_ONEHALFM1_CJ] + + paddd mm1,mm2 + paddd mm5,mm4 + paddd mm1,mm0 + paddd mm5,mm0 + psrld mm1,SCALEBITS ; mm1=CrEL + psrld mm5,SCALEBITS ; mm5=CrEH + packssdw mm1,mm5 ; mm1=CrE + + psllw mm7,BYTE_BIT + por mm1,mm7 ; mm1=Cr + movq MMWORD [edx], mm1 ; Save Cr + + sub ecx, byte SIZEOF_MMWORD + add esi, byte RGB_PIXELSIZE*SIZEOF_MMWORD ; inptr + add edi, byte SIZEOF_MMWORD ; outptr0 + add ebx, byte SIZEOF_MMWORD ; outptr1 + add edx, byte SIZEOF_MMWORD ; outptr2 + cmp ecx, byte SIZEOF_MMWORD + jae near .columnloop + test ecx,ecx + jnz near .column_ld1 + + pop ecx ; col + pop esi + pop edi + pop ebx + pop edx + poppic eax + + add esi, byte SIZEOF_JSAMPROW ; input_buf + add edi, byte SIZEOF_JSAMPROW + add ebx, byte SIZEOF_JSAMPROW + add edx, byte SIZEOF_JSAMPROW + dec eax ; num_rows + jg near .rowloop + + emms ; empty MMX state + +.return: + pop edi + pop esi +; pop edx ; need not be preserved +; pop ecx ; need not be preserved + pop ebx + mov esp,ebp ; esp <- aligned ebp + pop esp ; esp <- original ebp + pop ebp + ret + +; For some reason, the OS X linker does not honor the request to align the +; segment unless we do this. + align 16 diff --git a/media/libjpeg/simd/jccolext-sse2-64.asm b/media/libjpeg/simd/jccolext-sse2-64.asm new file mode 100644 index 000000000..8e4642d3b --- /dev/null +++ b/media/libjpeg/simd/jccolext-sse2-64.asm @@ -0,0 +1,486 @@ +; +; jccolext.asm - colorspace conversion (64-bit SSE2) +; +; Copyright (C) 2009, D. R. Commander. +; +; Based on the x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler). +; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 +; +; [TAB8] + +%include "jcolsamp.inc" + +; -------------------------------------------------------------------------- +; +; Convert some rows of samples to the output colorspace. +; +; GLOBAL(void) +; jsimd_rgb_ycc_convert_sse2 (JDIMENSION img_width, +; JSAMPARRAY input_buf, JSAMPIMAGE output_buf, +; JDIMENSION output_row, int num_rows); +; + +; r10 = JDIMENSION img_width +; r11 = JSAMPARRAY input_buf +; r12 = JSAMPIMAGE output_buf +; r13 = JDIMENSION output_row +; r14 = int num_rows + +%define wk(i) rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM] +%define WK_NUM 8 + + align 16 + + global EXTN(jsimd_rgb_ycc_convert_sse2) + +EXTN(jsimd_rgb_ycc_convert_sse2): + push rbp + mov rax,rsp ; rax = original rbp + sub rsp, byte 4 + and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits + mov [rsp],rax + mov rbp,rsp ; rbp = aligned rbp + lea rsp, [wk(0)] + collect_args + push rbx + + mov ecx, r10d + test rcx,rcx + jz near .return + + push rcx + + mov rsi, r12 + mov ecx, r13d + mov rdi, JSAMPARRAY [rsi+0*SIZEOF_JSAMPARRAY] + mov rbx, JSAMPARRAY [rsi+1*SIZEOF_JSAMPARRAY] + mov rdx, JSAMPARRAY [rsi+2*SIZEOF_JSAMPARRAY] + lea rdi, [rdi+rcx*SIZEOF_JSAMPROW] + lea rbx, [rbx+rcx*SIZEOF_JSAMPROW] + lea rdx, [rdx+rcx*SIZEOF_JSAMPROW] + + pop rcx + + mov rsi, r11 + mov eax, r14d + test rax,rax + jle near .return +.rowloop: + push rdx + push rbx + push rdi + push rsi + push rcx ; col + + mov rsi, JSAMPROW [rsi] ; inptr + mov rdi, JSAMPROW [rdi] ; outptr0 + mov rbx, JSAMPROW [rbx] ; outptr1 + mov rdx, JSAMPROW [rdx] ; outptr2 + + cmp rcx, byte SIZEOF_XMMWORD + jae near .columnloop + +%if RGB_PIXELSIZE == 3 ; --------------- + +.column_ld1: + push rax + push rdx + lea rcx,[rcx+rcx*2] ; imul ecx,RGB_PIXELSIZE + test cl, SIZEOF_BYTE + jz short .column_ld2 + sub rcx, byte SIZEOF_BYTE + movzx rax, BYTE [rsi+rcx] +.column_ld2: + test cl, SIZEOF_WORD + jz short .column_ld4 + sub rcx, byte SIZEOF_WORD + movzx rdx, WORD [rsi+rcx] + shl rax, WORD_BIT + or rax,rdx +.column_ld4: + movd xmmA,eax + pop rdx + pop rax + test cl, SIZEOF_DWORD + jz short .column_ld8 + sub rcx, byte SIZEOF_DWORD + movd xmmF, XMM_DWORD [rsi+rcx] + pslldq xmmA, SIZEOF_DWORD + por xmmA,xmmF +.column_ld8: + test cl, SIZEOF_MMWORD + jz short .column_ld16 + sub rcx, byte SIZEOF_MMWORD + movq xmmB, XMM_MMWORD [rsi+rcx] + pslldq xmmA, SIZEOF_MMWORD + por xmmA,xmmB +.column_ld16: + test cl, SIZEOF_XMMWORD + jz short .column_ld32 + movdqa xmmF,xmmA + movdqu xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD] + mov rcx, SIZEOF_XMMWORD + jmp short .rgb_ycc_cnv +.column_ld32: + test cl, 2*SIZEOF_XMMWORD + mov rcx, SIZEOF_XMMWORD + jz short .rgb_ycc_cnv + movdqa xmmB,xmmA + movdqu xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD] + movdqu xmmF, XMMWORD [rsi+1*SIZEOF_XMMWORD] + jmp short .rgb_ycc_cnv + +.columnloop: + movdqu xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD] + movdqu xmmF, XMMWORD [rsi+1*SIZEOF_XMMWORD] + movdqu xmmB, XMMWORD [rsi+2*SIZEOF_XMMWORD] + +.rgb_ycc_cnv: + ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05) + ; xmmF=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A) + ; xmmB=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F) + + movdqa xmmG,xmmA + pslldq xmmA,8 ; xmmA=(-- -- -- -- -- -- -- -- 00 10 20 01 11 21 02 12) + psrldq xmmG,8 ; xmmG=(22 03 13 23 04 14 24 05 -- -- -- -- -- -- -- --) + + punpckhbw xmmA,xmmF ; xmmA=(00 08 10 18 20 28 01 09 11 19 21 29 02 0A 12 1A) + pslldq xmmF,8 ; xmmF=(-- -- -- -- -- -- -- -- 15 25 06 16 26 07 17 27) + + punpcklbw xmmG,xmmB ; xmmG=(22 2A 03 0B 13 1B 23 2B 04 0C 14 1C 24 2C 05 0D) + punpckhbw xmmF,xmmB ; xmmF=(15 1D 25 2D 06 0E 16 1E 26 2E 07 0F 17 1F 27 2F) + + movdqa xmmD,xmmA + pslldq xmmA,8 ; xmmA=(-- -- -- -- -- -- -- -- 00 08 10 18 20 28 01 09) + psrldq xmmD,8 ; xmmD=(11 19 21 29 02 0A 12 1A -- -- -- -- -- -- -- --) + + punpckhbw xmmA,xmmG ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 01 05 09 0D) + pslldq xmmG,8 ; xmmG=(-- -- -- -- -- -- -- -- 22 2A 03 0B 13 1B 23 2B) + + punpcklbw xmmD,xmmF ; xmmD=(11 15 19 1D 21 25 29 2D 02 06 0A 0E 12 16 1A 1E) + punpckhbw xmmG,xmmF ; xmmG=(22 26 2A 2E 03 07 0B 0F 13 17 1B 1F 23 27 2B 2F) + + movdqa xmmE,xmmA + pslldq xmmA,8 ; xmmA=(-- -- -- -- -- -- -- -- 00 04 08 0C 10 14 18 1C) + psrldq xmmE,8 ; xmmE=(20 24 28 2C 01 05 09 0D -- -- -- -- -- -- -- --) + + punpckhbw xmmA,xmmD ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E) + pslldq xmmD,8 ; xmmD=(-- -- -- -- -- -- -- -- 11 15 19 1D 21 25 29 2D) + + punpcklbw xmmE,xmmG ; xmmE=(20 22 24 26 28 2A 2C 2E 01 03 05 07 09 0B 0D 0F) + punpckhbw xmmD,xmmG ; xmmD=(11 13 15 17 19 1B 1D 1F 21 23 25 27 29 2B 2D 2F) + + pxor xmmH,xmmH + + movdqa xmmC,xmmA + punpcklbw xmmA,xmmH ; xmmA=(00 02 04 06 08 0A 0C 0E) + punpckhbw xmmC,xmmH ; xmmC=(10 12 14 16 18 1A 1C 1E) + + movdqa xmmB,xmmE + punpcklbw xmmE,xmmH ; xmmE=(20 22 24 26 28 2A 2C 2E) + punpckhbw xmmB,xmmH ; xmmB=(01 03 05 07 09 0B 0D 0F) + + movdqa xmmF,xmmD + punpcklbw xmmD,xmmH ; xmmD=(11 13 15 17 19 1B 1D 1F) + punpckhbw xmmF,xmmH ; xmmF=(21 23 25 27 29 2B 2D 2F) + +%else ; RGB_PIXELSIZE == 4 ; ----------- + +.column_ld1: + test cl, SIZEOF_XMMWORD/16 + jz short .column_ld2 + sub rcx, byte SIZEOF_XMMWORD/16 + movd xmmA, XMM_DWORD [rsi+rcx*RGB_PIXELSIZE] +.column_ld2: + test cl, SIZEOF_XMMWORD/8 + jz short .column_ld4 + sub rcx, byte SIZEOF_XMMWORD/8 + movq xmmE, XMM_MMWORD [rsi+rcx*RGB_PIXELSIZE] + pslldq xmmA, SIZEOF_MMWORD + por xmmA,xmmE +.column_ld4: + test cl, SIZEOF_XMMWORD/4 + jz short .column_ld8 + sub rcx, byte SIZEOF_XMMWORD/4 + movdqa xmmE,xmmA + movdqu xmmA, XMMWORD [rsi+rcx*RGB_PIXELSIZE] +.column_ld8: + test cl, SIZEOF_XMMWORD/2 + mov rcx, SIZEOF_XMMWORD + jz short .rgb_ycc_cnv + movdqa xmmF,xmmA + movdqa xmmH,xmmE + movdqu xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD] + movdqu xmmE, XMMWORD [rsi+1*SIZEOF_XMMWORD] + jmp short .rgb_ycc_cnv + +.columnloop: + movdqu xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD] + movdqu xmmE, XMMWORD [rsi+1*SIZEOF_XMMWORD] + movdqu xmmF, XMMWORD [rsi+2*SIZEOF_XMMWORD] + movdqu xmmH, XMMWORD [rsi+3*SIZEOF_XMMWORD] + +.rgb_ycc_cnv: + ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33) + ; xmmE=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37) + ; xmmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B) + ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F) + + movdqa xmmD,xmmA + punpcklbw xmmA,xmmE ; xmmA=(00 04 10 14 20 24 30 34 01 05 11 15 21 25 31 35) + punpckhbw xmmD,xmmE ; xmmD=(02 06 12 16 22 26 32 36 03 07 13 17 23 27 33 37) + + movdqa xmmC,xmmF + punpcklbw xmmF,xmmH ; xmmF=(08 0C 18 1C 28 2C 38 3C 09 0D 19 1D 29 2D 39 3D) + punpckhbw xmmC,xmmH ; xmmC=(0A 0E 1A 1E 2A 2E 3A 3E 0B 0F 1B 1F 2B 2F 3B 3F) + + movdqa xmmB,xmmA + punpcklwd xmmA,xmmF ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 30 34 38 3C) + punpckhwd xmmB,xmmF ; xmmB=(01 05 09 0D 11 15 19 1D 21 25 29 2D 31 35 39 3D) + + movdqa xmmG,xmmD + punpcklwd xmmD,xmmC ; xmmD=(02 06 0A 0E 12 16 1A 1E 22 26 2A 2E 32 36 3A 3E) + punpckhwd xmmG,xmmC ; xmmG=(03 07 0B 0F 13 17 1B 1F 23 27 2B 2F 33 37 3B 3F) + + movdqa xmmE,xmmA + punpcklbw xmmA,xmmD ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E) + punpckhbw xmmE,xmmD ; xmmE=(20 22 24 26 28 2A 2C 2E 30 32 34 36 38 3A 3C 3E) + + movdqa xmmH,xmmB + punpcklbw xmmB,xmmG ; xmmB=(01 03 05 07 09 0B 0D 0F 11 13 15 17 19 1B 1D 1F) + punpckhbw xmmH,xmmG ; xmmH=(21 23 25 27 29 2B 2D 2F 31 33 35 37 39 3B 3D 3F) + + pxor xmmF,xmmF + + movdqa xmmC,xmmA + punpcklbw xmmA,xmmF ; xmmA=(00 02 04 06 08 0A 0C 0E) + punpckhbw xmmC,xmmF ; xmmC=(10 12 14 16 18 1A 1C 1E) + + movdqa xmmD,xmmB + punpcklbw xmmB,xmmF ; xmmB=(01 03 05 07 09 0B 0D 0F) + punpckhbw xmmD,xmmF ; xmmD=(11 13 15 17 19 1B 1D 1F) + + movdqa xmmG,xmmE + punpcklbw xmmE,xmmF ; xmmE=(20 22 24 26 28 2A 2C 2E) + punpckhbw xmmG,xmmF ; xmmG=(30 32 34 36 38 3A 3C 3E) + + punpcklbw xmmF,xmmH + punpckhbw xmmH,xmmH + psrlw xmmF,BYTE_BIT ; xmmF=(21 23 25 27 29 2B 2D 2F) + psrlw xmmH,BYTE_BIT ; xmmH=(31 33 35 37 39 3B 3D 3F) + +%endif ; RGB_PIXELSIZE ; --------------- + + ; xmm0=R(02468ACE)=RE, xmm2=G(02468ACE)=GE, xmm4=B(02468ACE)=BE + ; xmm1=R(13579BDF)=RO, xmm3=G(13579BDF)=GO, xmm5=B(13579BDF)=BO + + ; (Original) + ; Y = 0.29900 * R + 0.58700 * G + 0.11400 * B + ; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE + ; Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE + ; + ; (This implementation) + ; Y = 0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G + ; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE + ; Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE + + movdqa XMMWORD [wk(0)], xmm0 ; wk(0)=RE + movdqa XMMWORD [wk(1)], xmm1 ; wk(1)=RO + movdqa XMMWORD [wk(2)], xmm4 ; wk(2)=BE + movdqa XMMWORD [wk(3)], xmm5 ; wk(3)=BO + + movdqa xmm6,xmm1 + punpcklwd xmm1,xmm3 + punpckhwd xmm6,xmm3 + movdqa xmm7,xmm1 + movdqa xmm4,xmm6 + pmaddwd xmm1,[rel PW_F0299_F0337] ; xmm1=ROL*FIX(0.299)+GOL*FIX(0.337) + pmaddwd xmm6,[rel PW_F0299_F0337] ; xmm6=ROH*FIX(0.299)+GOH*FIX(0.337) + pmaddwd xmm7,[rel PW_MF016_MF033] ; xmm7=ROL*-FIX(0.168)+GOL*-FIX(0.331) + pmaddwd xmm4,[rel PW_MF016_MF033] ; xmm4=ROH*-FIX(0.168)+GOH*-FIX(0.331) + + movdqa XMMWORD [wk(4)], xmm1 ; wk(4)=ROL*FIX(0.299)+GOL*FIX(0.337) + movdqa XMMWORD [wk(5)], xmm6 ; wk(5)=ROH*FIX(0.299)+GOH*FIX(0.337) + + pxor xmm1,xmm1 + pxor xmm6,xmm6 + punpcklwd xmm1,xmm5 ; xmm1=BOL + punpckhwd xmm6,xmm5 ; xmm6=BOH + psrld xmm1,1 ; xmm1=BOL*FIX(0.500) + psrld xmm6,1 ; xmm6=BOH*FIX(0.500) + + movdqa xmm5,[rel PD_ONEHALFM1_CJ] ; xmm5=[PD_ONEHALFM1_CJ] + + paddd xmm7,xmm1 + paddd xmm4,xmm6 + paddd xmm7,xmm5 + paddd xmm4,xmm5 + psrld xmm7,SCALEBITS ; xmm7=CbOL + psrld xmm4,SCALEBITS ; xmm4=CbOH + packssdw xmm7,xmm4 ; xmm7=CbO + + movdqa xmm1, XMMWORD [wk(2)] ; xmm1=BE + + movdqa xmm6,xmm0 + punpcklwd xmm0,xmm2 + punpckhwd xmm6,xmm2 + movdqa xmm5,xmm0 + movdqa xmm4,xmm6 + pmaddwd xmm0,[rel PW_F0299_F0337] ; xmm0=REL*FIX(0.299)+GEL*FIX(0.337) + pmaddwd xmm6,[rel PW_F0299_F0337] ; xmm6=REH*FIX(0.299)+GEH*FIX(0.337) + pmaddwd xmm5,[rel PW_MF016_MF033] ; xmm5=REL*-FIX(0.168)+GEL*-FIX(0.331) + pmaddwd xmm4,[rel PW_MF016_MF033] ; xmm4=REH*-FIX(0.168)+GEH*-FIX(0.331) + + movdqa XMMWORD [wk(6)], xmm0 ; wk(6)=REL*FIX(0.299)+GEL*FIX(0.337) + movdqa XMMWORD [wk(7)], xmm6 ; wk(7)=REH*FIX(0.299)+GEH*FIX(0.337) + + pxor xmm0,xmm0 + pxor xmm6,xmm6 + punpcklwd xmm0,xmm1 ; xmm0=BEL + punpckhwd xmm6,xmm1 ; xmm6=BEH + psrld xmm0,1 ; xmm0=BEL*FIX(0.500) + psrld xmm6,1 ; xmm6=BEH*FIX(0.500) + + movdqa xmm1,[rel PD_ONEHALFM1_CJ] ; xmm1=[PD_ONEHALFM1_CJ] + + paddd xmm5,xmm0 + paddd xmm4,xmm6 + paddd xmm5,xmm1 + paddd xmm4,xmm1 + psrld xmm5,SCALEBITS ; xmm5=CbEL + psrld xmm4,SCALEBITS ; xmm4=CbEH + packssdw xmm5,xmm4 ; xmm5=CbE + + psllw xmm7,BYTE_BIT + por xmm5,xmm7 ; xmm5=Cb + movdqa XMMWORD [rbx], xmm5 ; Save Cb + + movdqa xmm0, XMMWORD [wk(3)] ; xmm0=BO + movdqa xmm6, XMMWORD [wk(2)] ; xmm6=BE + movdqa xmm1, XMMWORD [wk(1)] ; xmm1=RO + + movdqa xmm4,xmm0 + punpcklwd xmm0,xmm3 + punpckhwd xmm4,xmm3 + movdqa xmm7,xmm0 + movdqa xmm5,xmm4 + pmaddwd xmm0,[rel PW_F0114_F0250] ; xmm0=BOL*FIX(0.114)+GOL*FIX(0.250) + pmaddwd xmm4,[rel PW_F0114_F0250] ; xmm4=BOH*FIX(0.114)+GOH*FIX(0.250) + pmaddwd xmm7,[rel PW_MF008_MF041] ; xmm7=BOL*-FIX(0.081)+GOL*-FIX(0.418) + pmaddwd xmm5,[rel PW_MF008_MF041] ; xmm5=BOH*-FIX(0.081)+GOH*-FIX(0.418) + + movdqa xmm3,[rel PD_ONEHALF] ; xmm3=[PD_ONEHALF] + + paddd xmm0, XMMWORD [wk(4)] + paddd xmm4, XMMWORD [wk(5)] + paddd xmm0,xmm3 + paddd xmm4,xmm3 + psrld xmm0,SCALEBITS ; xmm0=YOL + psrld xmm4,SCALEBITS ; xmm4=YOH + packssdw xmm0,xmm4 ; xmm0=YO + + pxor xmm3,xmm3 + pxor xmm4,xmm4 + punpcklwd xmm3,xmm1 ; xmm3=ROL + punpckhwd xmm4,xmm1 ; xmm4=ROH + psrld xmm3,1 ; xmm3=ROL*FIX(0.500) + psrld xmm4,1 ; xmm4=ROH*FIX(0.500) + + movdqa xmm1,[rel PD_ONEHALFM1_CJ] ; xmm1=[PD_ONEHALFM1_CJ] + + paddd xmm7,xmm3 + paddd xmm5,xmm4 + paddd xmm7,xmm1 + paddd xmm5,xmm1 + psrld xmm7,SCALEBITS ; xmm7=CrOL + psrld xmm5,SCALEBITS ; xmm5=CrOH + packssdw xmm7,xmm5 ; xmm7=CrO + + movdqa xmm3, XMMWORD [wk(0)] ; xmm3=RE + + movdqa xmm4,xmm6 + punpcklwd xmm6,xmm2 + punpckhwd xmm4,xmm2 + movdqa xmm1,xmm6 + movdqa xmm5,xmm4 + pmaddwd xmm6,[rel PW_F0114_F0250] ; xmm6=BEL*FIX(0.114)+GEL*FIX(0.250) + pmaddwd xmm4,[rel PW_F0114_F0250] ; xmm4=BEH*FIX(0.114)+GEH*FIX(0.250) + pmaddwd xmm1,[rel PW_MF008_MF041] ; xmm1=BEL*-FIX(0.081)+GEL*-FIX(0.418) + pmaddwd xmm5,[rel PW_MF008_MF041] ; xmm5=BEH*-FIX(0.081)+GEH*-FIX(0.418) + + movdqa xmm2,[rel PD_ONEHALF] ; xmm2=[PD_ONEHALF] + + paddd xmm6, XMMWORD [wk(6)] + paddd xmm4, XMMWORD [wk(7)] + paddd xmm6,xmm2 + paddd xmm4,xmm2 + psrld xmm6,SCALEBITS ; xmm6=YEL + psrld xmm4,SCALEBITS ; xmm4=YEH + packssdw xmm6,xmm4 ; xmm6=YE + + psllw xmm0,BYTE_BIT + por xmm6,xmm0 ; xmm6=Y + movdqa XMMWORD [rdi], xmm6 ; Save Y + + pxor xmm2,xmm2 + pxor xmm4,xmm4 + punpcklwd xmm2,xmm3 ; xmm2=REL + punpckhwd xmm4,xmm3 ; xmm4=REH + psrld xmm2,1 ; xmm2=REL*FIX(0.500) + psrld xmm4,1 ; xmm4=REH*FIX(0.500) + + movdqa xmm0,[rel PD_ONEHALFM1_CJ] ; xmm0=[PD_ONEHALFM1_CJ] + + paddd xmm1,xmm2 + paddd xmm5,xmm4 + paddd xmm1,xmm0 + paddd xmm5,xmm0 + psrld xmm1,SCALEBITS ; xmm1=CrEL + psrld xmm5,SCALEBITS ; xmm5=CrEH + packssdw xmm1,xmm5 ; xmm1=CrE + + psllw xmm7,BYTE_BIT + por xmm1,xmm7 ; xmm1=Cr + movdqa XMMWORD [rdx], xmm1 ; Save Cr + + sub rcx, byte SIZEOF_XMMWORD + add rsi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; inptr + add rdi, byte SIZEOF_XMMWORD ; outptr0 + add rbx, byte SIZEOF_XMMWORD ; outptr1 + add rdx, byte SIZEOF_XMMWORD ; outptr2 + cmp rcx, byte SIZEOF_XMMWORD + jae near .columnloop + test rcx,rcx + jnz near .column_ld1 + + pop rcx ; col + pop rsi + pop rdi + pop rbx + pop rdx + + add rsi, byte SIZEOF_JSAMPROW ; input_buf + add rdi, byte SIZEOF_JSAMPROW + add rbx, byte SIZEOF_JSAMPROW + add rdx, byte SIZEOF_JSAMPROW + dec rax ; num_rows + jg near .rowloop + +.return: + pop rbx + uncollect_args + mov rsp,rbp ; rsp <- aligned rbp + pop rsp ; rsp <- original rbp + pop rbp + ret + +; For some reason, the OS X linker does not honor the request to align the +; segment unless we do this. + align 16 diff --git a/media/libjpeg/simd/jccolext-sse2.asm b/media/libjpeg/simd/jccolext-sse2.asm new file mode 100644 index 000000000..cc38e98a1 --- /dev/null +++ b/media/libjpeg/simd/jccolext-sse2.asm @@ -0,0 +1,503 @@ +; +; jccolext.asm - colorspace conversion (SSE2) +; +; x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler). +; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 +; +; [TAB8] + +%include "jcolsamp.inc" + +; -------------------------------------------------------------------------- +; +; Convert some rows of samples to the output colorspace. +; +; GLOBAL(void) +; jsimd_rgb_ycc_convert_sse2 (JDIMENSION img_width, +; JSAMPARRAY input_buf, JSAMPIMAGE output_buf, +; JDIMENSION output_row, int num_rows); +; + +%define img_width(b) (b)+8 ; JDIMENSION img_width +%define input_buf(b) (b)+12 ; JSAMPARRAY input_buf +%define output_buf(b) (b)+16 ; JSAMPIMAGE output_buf +%define output_row(b) (b)+20 ; JDIMENSION output_row +%define num_rows(b) (b)+24 ; int num_rows + +%define original_ebp ebp+0 +%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM] +%define WK_NUM 8 +%define gotptr wk(0)-SIZEOF_POINTER ; void * gotptr + + align 16 + + global EXTN(jsimd_rgb_ycc_convert_sse2) + +EXTN(jsimd_rgb_ycc_convert_sse2): + push ebp + mov eax,esp ; eax = original ebp + sub esp, byte 4 + and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits + mov [esp],eax + mov ebp,esp ; ebp = aligned ebp + lea esp, [wk(0)] + pushpic eax ; make a room for GOT address + push ebx +; push ecx ; need not be preserved +; push edx ; need not be preserved + push esi + push edi + + get_GOT ebx ; get GOT address + movpic POINTER [gotptr], ebx ; save GOT address + + mov ecx, JDIMENSION [img_width(eax)] + test ecx,ecx + jz near .return + + push ecx + + mov esi, JSAMPIMAGE [output_buf(eax)] + mov ecx, JDIMENSION [output_row(eax)] + mov edi, JSAMPARRAY [esi+0*SIZEOF_JSAMPARRAY] + mov ebx, JSAMPARRAY [esi+1*SIZEOF_JSAMPARRAY] + mov edx, JSAMPARRAY [esi+2*SIZEOF_JSAMPARRAY] + lea edi, [edi+ecx*SIZEOF_JSAMPROW] + lea ebx, [ebx+ecx*SIZEOF_JSAMPROW] + lea edx, [edx+ecx*SIZEOF_JSAMPROW] + + pop ecx + + mov esi, JSAMPARRAY [input_buf(eax)] + mov eax, INT [num_rows(eax)] + test eax,eax + jle near .return + alignx 16,7 +.rowloop: + pushpic eax + push edx + push ebx + push edi + push esi + push ecx ; col + + mov esi, JSAMPROW [esi] ; inptr + mov edi, JSAMPROW [edi] ; outptr0 + mov ebx, JSAMPROW [ebx] ; outptr1 + mov edx, JSAMPROW [edx] ; outptr2 + movpic eax, POINTER [gotptr] ; load GOT address (eax) + + cmp ecx, byte SIZEOF_XMMWORD + jae near .columnloop + alignx 16,7 + +%if RGB_PIXELSIZE == 3 ; --------------- + +.column_ld1: + push eax + push edx + lea ecx,[ecx+ecx*2] ; imul ecx,RGB_PIXELSIZE + test cl, SIZEOF_BYTE + jz short .column_ld2 + sub ecx, byte SIZEOF_BYTE + movzx eax, BYTE [esi+ecx] +.column_ld2: + test cl, SIZEOF_WORD + jz short .column_ld4 + sub ecx, byte SIZEOF_WORD + movzx edx, WORD [esi+ecx] + shl eax, WORD_BIT + or eax,edx +.column_ld4: + movd xmmA,eax + pop edx + pop eax + test cl, SIZEOF_DWORD + jz short .column_ld8 + sub ecx, byte SIZEOF_DWORD + movd xmmF, XMM_DWORD [esi+ecx] + pslldq xmmA, SIZEOF_DWORD + por xmmA,xmmF +.column_ld8: + test cl, SIZEOF_MMWORD + jz short .column_ld16 + sub ecx, byte SIZEOF_MMWORD + movq xmmB, XMM_MMWORD [esi+ecx] + pslldq xmmA, SIZEOF_MMWORD + por xmmA,xmmB +.column_ld16: + test cl, SIZEOF_XMMWORD + jz short .column_ld32 + movdqa xmmF,xmmA + movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD] + mov ecx, SIZEOF_XMMWORD + jmp short .rgb_ycc_cnv +.column_ld32: + test cl, 2*SIZEOF_XMMWORD + mov ecx, SIZEOF_XMMWORD + jz short .rgb_ycc_cnv + movdqa xmmB,xmmA + movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD] + movdqu xmmF, XMMWORD [esi+1*SIZEOF_XMMWORD] + jmp short .rgb_ycc_cnv + alignx 16,7 + +.columnloop: + movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD] + movdqu xmmF, XMMWORD [esi+1*SIZEOF_XMMWORD] + movdqu xmmB, XMMWORD [esi+2*SIZEOF_XMMWORD] + +.rgb_ycc_cnv: + ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05) + ; xmmF=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A) + ; xmmB=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F) + + movdqa xmmG,xmmA + pslldq xmmA,8 ; xmmA=(-- -- -- -- -- -- -- -- 00 10 20 01 11 21 02 12) + psrldq xmmG,8 ; xmmG=(22 03 13 23 04 14 24 05 -- -- -- -- -- -- -- --) + + punpckhbw xmmA,xmmF ; xmmA=(00 08 10 18 20 28 01 09 11 19 21 29 02 0A 12 1A) + pslldq xmmF,8 ; xmmF=(-- -- -- -- -- -- -- -- 15 25 06 16 26 07 17 27) + + punpcklbw xmmG,xmmB ; xmmG=(22 2A 03 0B 13 1B 23 2B 04 0C 14 1C 24 2C 05 0D) + punpckhbw xmmF,xmmB ; xmmF=(15 1D 25 2D 06 0E 16 1E 26 2E 07 0F 17 1F 27 2F) + + movdqa xmmD,xmmA + pslldq xmmA,8 ; xmmA=(-- -- -- -- -- -- -- -- 00 08 10 18 20 28 01 09) + psrldq xmmD,8 ; xmmD=(11 19 21 29 02 0A 12 1A -- -- -- -- -- -- -- --) + + punpckhbw xmmA,xmmG ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 01 05 09 0D) + pslldq xmmG,8 ; xmmG=(-- -- -- -- -- -- -- -- 22 2A 03 0B 13 1B 23 2B) + + punpcklbw xmmD,xmmF ; xmmD=(11 15 19 1D 21 25 29 2D 02 06 0A 0E 12 16 1A 1E) + punpckhbw xmmG,xmmF ; xmmG=(22 26 2A 2E 03 07 0B 0F 13 17 1B 1F 23 27 2B 2F) + + movdqa xmmE,xmmA + pslldq xmmA,8 ; xmmA=(-- -- -- -- -- -- -- -- 00 04 08 0C 10 14 18 1C) + psrldq xmmE,8 ; xmmE=(20 24 28 2C 01 05 09 0D -- -- -- -- -- -- -- --) + + punpckhbw xmmA,xmmD ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E) + pslldq xmmD,8 ; xmmD=(-- -- -- -- -- -- -- -- 11 15 19 1D 21 25 29 2D) + + punpcklbw xmmE,xmmG ; xmmE=(20 22 24 26 28 2A 2C 2E 01 03 05 07 09 0B 0D 0F) + punpckhbw xmmD,xmmG ; xmmD=(11 13 15 17 19 1B 1D 1F 21 23 25 27 29 2B 2D 2F) + + pxor xmmH,xmmH + + movdqa xmmC,xmmA + punpcklbw xmmA,xmmH ; xmmA=(00 02 04 06 08 0A 0C 0E) + punpckhbw xmmC,xmmH ; xmmC=(10 12 14 16 18 1A 1C 1E) + + movdqa xmmB,xmmE + punpcklbw xmmE,xmmH ; xmmE=(20 22 24 26 28 2A 2C 2E) + punpckhbw xmmB,xmmH ; xmmB=(01 03 05 07 09 0B 0D 0F) + + movdqa xmmF,xmmD + punpcklbw xmmD,xmmH ; xmmD=(11 13 15 17 19 1B 1D 1F) + punpckhbw xmmF,xmmH ; xmmF=(21 23 25 27 29 2B 2D 2F) + +%else ; RGB_PIXELSIZE == 4 ; ----------- + +.column_ld1: + test cl, SIZEOF_XMMWORD/16 + jz short .column_ld2 + sub ecx, byte SIZEOF_XMMWORD/16 + movd xmmA, XMM_DWORD [esi+ecx*RGB_PIXELSIZE] +.column_ld2: + test cl, SIZEOF_XMMWORD/8 + jz short .column_ld4 + sub ecx, byte SIZEOF_XMMWORD/8 + movq xmmE, XMM_MMWORD [esi+ecx*RGB_PIXELSIZE] + pslldq xmmA, SIZEOF_MMWORD + por xmmA,xmmE +.column_ld4: + test cl, SIZEOF_XMMWORD/4 + jz short .column_ld8 + sub ecx, byte SIZEOF_XMMWORD/4 + movdqa xmmE,xmmA + movdqu xmmA, XMMWORD [esi+ecx*RGB_PIXELSIZE] +.column_ld8: + test cl, SIZEOF_XMMWORD/2 + mov ecx, SIZEOF_XMMWORD + jz short .rgb_ycc_cnv + movdqa xmmF,xmmA + movdqa xmmH,xmmE + movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD] + movdqu xmmE, XMMWORD [esi+1*SIZEOF_XMMWORD] + jmp short .rgb_ycc_cnv + alignx 16,7 + +.columnloop: + movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD] + movdqu xmmE, XMMWORD [esi+1*SIZEOF_XMMWORD] + movdqu xmmF, XMMWORD [esi+2*SIZEOF_XMMWORD] + movdqu xmmH, XMMWORD [esi+3*SIZEOF_XMMWORD] + +.rgb_ycc_cnv: + ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33) + ; xmmE=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37) + ; xmmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B) + ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F) + + movdqa xmmD,xmmA + punpcklbw xmmA,xmmE ; xmmA=(00 04 10 14 20 24 30 34 01 05 11 15 21 25 31 35) + punpckhbw xmmD,xmmE ; xmmD=(02 06 12 16 22 26 32 36 03 07 13 17 23 27 33 37) + + movdqa xmmC,xmmF + punpcklbw xmmF,xmmH ; xmmF=(08 0C 18 1C 28 2C 38 3C 09 0D 19 1D 29 2D 39 3D) + punpckhbw xmmC,xmmH ; xmmC=(0A 0E 1A 1E 2A 2E 3A 3E 0B 0F 1B 1F 2B 2F 3B 3F) + + movdqa xmmB,xmmA + punpcklwd xmmA,xmmF ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 30 34 38 3C) + punpckhwd xmmB,xmmF ; xmmB=(01 05 09 0D 11 15 19 1D 21 25 29 2D 31 35 39 3D) + + movdqa xmmG,xmmD + punpcklwd xmmD,xmmC ; xmmD=(02 06 0A 0E 12 16 1A 1E 22 26 2A 2E 32 36 3A 3E) + punpckhwd xmmG,xmmC ; xmmG=(03 07 0B 0F 13 17 1B 1F 23 27 2B 2F 33 37 3B 3F) + + movdqa xmmE,xmmA + punpcklbw xmmA,xmmD ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E) + punpckhbw xmmE,xmmD ; xmmE=(20 22 24 26 28 2A 2C 2E 30 32 34 36 38 3A 3C 3E) + + movdqa xmmH,xmmB + punpcklbw xmmB,xmmG ; xmmB=(01 03 05 07 09 0B 0D 0F 11 13 15 17 19 1B 1D 1F) + punpckhbw xmmH,xmmG ; xmmH=(21 23 25 27 29 2B 2D 2F 31 33 35 37 39 3B 3D 3F) + + pxor xmmF,xmmF + + movdqa xmmC,xmmA + punpcklbw xmmA,xmmF ; xmmA=(00 02 04 06 08 0A 0C 0E) + punpckhbw xmmC,xmmF ; xmmC=(10 12 14 16 18 1A 1C 1E) + + movdqa xmmD,xmmB + punpcklbw xmmB,xmmF ; xmmB=(01 03 05 07 09 0B 0D 0F) + punpckhbw xmmD,xmmF ; xmmD=(11 13 15 17 19 1B 1D 1F) + + movdqa xmmG,xmmE + punpcklbw xmmE,xmmF ; xmmE=(20 22 24 26 28 2A 2C 2E) + punpckhbw xmmG,xmmF ; xmmG=(30 32 34 36 38 3A 3C 3E) + + punpcklbw xmmF,xmmH + punpckhbw xmmH,xmmH + psrlw xmmF,BYTE_BIT ; xmmF=(21 23 25 27 29 2B 2D 2F) + psrlw xmmH,BYTE_BIT ; xmmH=(31 33 35 37 39 3B 3D 3F) + +%endif ; RGB_PIXELSIZE ; --------------- + + ; xmm0=R(02468ACE)=RE, xmm2=G(02468ACE)=GE, xmm4=B(02468ACE)=BE + ; xmm1=R(13579BDF)=RO, xmm3=G(13579BDF)=GO, xmm5=B(13579BDF)=BO + + ; (Original) + ; Y = 0.29900 * R + 0.58700 * G + 0.11400 * B + ; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE + ; Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE + ; + ; (This implementation) + ; Y = 0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G + ; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE + ; Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE + + movdqa XMMWORD [wk(0)], xmm0 ; wk(0)=RE + movdqa XMMWORD [wk(1)], xmm1 ; wk(1)=RO + movdqa XMMWORD [wk(2)], xmm4 ; wk(2)=BE + movdqa XMMWORD [wk(3)], xmm5 ; wk(3)=BO + + movdqa xmm6,xmm1 + punpcklwd xmm1,xmm3 + punpckhwd xmm6,xmm3 + movdqa xmm7,xmm1 + movdqa xmm4,xmm6 + pmaddwd xmm1,[GOTOFF(eax,PW_F0299_F0337)] ; xmm1=ROL*FIX(0.299)+GOL*FIX(0.337) + pmaddwd xmm6,[GOTOFF(eax,PW_F0299_F0337)] ; xmm6=ROH*FIX(0.299)+GOH*FIX(0.337) + pmaddwd xmm7,[GOTOFF(eax,PW_MF016_MF033)] ; xmm7=ROL*-FIX(0.168)+GOL*-FIX(0.331) + pmaddwd xmm4,[GOTOFF(eax,PW_MF016_MF033)] ; xmm4=ROH*-FIX(0.168)+GOH*-FIX(0.331) + + movdqa XMMWORD [wk(4)], xmm1 ; wk(4)=ROL*FIX(0.299)+GOL*FIX(0.337) + movdqa XMMWORD [wk(5)], xmm6 ; wk(5)=ROH*FIX(0.299)+GOH*FIX(0.337) + + pxor xmm1,xmm1 + pxor xmm6,xmm6 + punpcklwd xmm1,xmm5 ; xmm1=BOL + punpckhwd xmm6,xmm5 ; xmm6=BOH + psrld xmm1,1 ; xmm1=BOL*FIX(0.500) + psrld xmm6,1 ; xmm6=BOH*FIX(0.500) + + movdqa xmm5,[GOTOFF(eax,PD_ONEHALFM1_CJ)] ; xmm5=[PD_ONEHALFM1_CJ] + + paddd xmm7,xmm1 + paddd xmm4,xmm6 + paddd xmm7,xmm5 + paddd xmm4,xmm5 + psrld xmm7,SCALEBITS ; xmm7=CbOL + psrld xmm4,SCALEBITS ; xmm4=CbOH + packssdw xmm7,xmm4 ; xmm7=CbO + + movdqa xmm1, XMMWORD [wk(2)] ; xmm1=BE + + movdqa xmm6,xmm0 + punpcklwd xmm0,xmm2 + punpckhwd xmm6,xmm2 + movdqa xmm5,xmm0 + movdqa xmm4,xmm6 + pmaddwd xmm0,[GOTOFF(eax,PW_F0299_F0337)] ; xmm0=REL*FIX(0.299)+GEL*FIX(0.337) + pmaddwd xmm6,[GOTOFF(eax,PW_F0299_F0337)] ; xmm6=REH*FIX(0.299)+GEH*FIX(0.337) + pmaddwd xmm5,[GOTOFF(eax,PW_MF016_MF033)] ; xmm5=REL*-FIX(0.168)+GEL*-FIX(0.331) + pmaddwd xmm4,[GOTOFF(eax,PW_MF016_MF033)] ; xmm4=REH*-FIX(0.168)+GEH*-FIX(0.331) + + movdqa XMMWORD [wk(6)], xmm0 ; wk(6)=REL*FIX(0.299)+GEL*FIX(0.337) + movdqa XMMWORD [wk(7)], xmm6 ; wk(7)=REH*FIX(0.299)+GEH*FIX(0.337) + + pxor xmm0,xmm0 + pxor xmm6,xmm6 + punpcklwd xmm0,xmm1 ; xmm0=BEL + punpckhwd xmm6,xmm1 ; xmm6=BEH + psrld xmm0,1 ; xmm0=BEL*FIX(0.500) + psrld xmm6,1 ; xmm6=BEH*FIX(0.500) + + movdqa xmm1,[GOTOFF(eax,PD_ONEHALFM1_CJ)] ; xmm1=[PD_ONEHALFM1_CJ] + + paddd xmm5,xmm0 + paddd xmm4,xmm6 + paddd xmm5,xmm1 + paddd xmm4,xmm1 + psrld xmm5,SCALEBITS ; xmm5=CbEL + psrld xmm4,SCALEBITS ; xmm4=CbEH + packssdw xmm5,xmm4 ; xmm5=CbE + + psllw xmm7,BYTE_BIT + por xmm5,xmm7 ; xmm5=Cb + movdqa XMMWORD [ebx], xmm5 ; Save Cb + + movdqa xmm0, XMMWORD [wk(3)] ; xmm0=BO + movdqa xmm6, XMMWORD [wk(2)] ; xmm6=BE + movdqa xmm1, XMMWORD [wk(1)] ; xmm1=RO + + movdqa xmm4,xmm0 + punpcklwd xmm0,xmm3 + punpckhwd xmm4,xmm3 + movdqa xmm7,xmm0 + movdqa xmm5,xmm4 + pmaddwd xmm0,[GOTOFF(eax,PW_F0114_F0250)] ; xmm0=BOL*FIX(0.114)+GOL*FIX(0.250) + pmaddwd xmm4,[GOTOFF(eax,PW_F0114_F0250)] ; xmm4=BOH*FIX(0.114)+GOH*FIX(0.250) + pmaddwd xmm7,[GOTOFF(eax,PW_MF008_MF041)] ; xmm7=BOL*-FIX(0.081)+GOL*-FIX(0.418) + pmaddwd xmm5,[GOTOFF(eax,PW_MF008_MF041)] ; xmm5=BOH*-FIX(0.081)+GOH*-FIX(0.418) + + movdqa xmm3,[GOTOFF(eax,PD_ONEHALF)] ; xmm3=[PD_ONEHALF] + + paddd xmm0, XMMWORD [wk(4)] + paddd xmm4, XMMWORD [wk(5)] + paddd xmm0,xmm3 + paddd xmm4,xmm3 + psrld xmm0,SCALEBITS ; xmm0=YOL + psrld xmm4,SCALEBITS ; xmm4=YOH + packssdw xmm0,xmm4 ; xmm0=YO + + pxor xmm3,xmm3 + pxor xmm4,xmm4 + punpcklwd xmm3,xmm1 ; xmm3=ROL + punpckhwd xmm4,xmm1 ; xmm4=ROH + psrld xmm3,1 ; xmm3=ROL*FIX(0.500) + psrld xmm4,1 ; xmm4=ROH*FIX(0.500) + + movdqa xmm1,[GOTOFF(eax,PD_ONEHALFM1_CJ)] ; xmm1=[PD_ONEHALFM1_CJ] + + paddd xmm7,xmm3 + paddd xmm5,xmm4 + paddd xmm7,xmm1 + paddd xmm5,xmm1 + psrld xmm7,SCALEBITS ; xmm7=CrOL + psrld xmm5,SCALEBITS ; xmm5=CrOH + packssdw xmm7,xmm5 ; xmm7=CrO + + movdqa xmm3, XMMWORD [wk(0)] ; xmm3=RE + + movdqa xmm4,xmm6 + punpcklwd xmm6,xmm2 + punpckhwd xmm4,xmm2 + movdqa xmm1,xmm6 + movdqa xmm5,xmm4 + pmaddwd xmm6,[GOTOFF(eax,PW_F0114_F0250)] ; xmm6=BEL*FIX(0.114)+GEL*FIX(0.250) + pmaddwd xmm4,[GOTOFF(eax,PW_F0114_F0250)] ; xmm4=BEH*FIX(0.114)+GEH*FIX(0.250) + pmaddwd xmm1,[GOTOFF(eax,PW_MF008_MF041)] ; xmm1=BEL*-FIX(0.081)+GEL*-FIX(0.418) + pmaddwd xmm5,[GOTOFF(eax,PW_MF008_MF041)] ; xmm5=BEH*-FIX(0.081)+GEH*-FIX(0.418) + + movdqa xmm2,[GOTOFF(eax,PD_ONEHALF)] ; xmm2=[PD_ONEHALF] + + paddd xmm6, XMMWORD [wk(6)] + paddd xmm4, XMMWORD [wk(7)] + paddd xmm6,xmm2 + paddd xmm4,xmm2 + psrld xmm6,SCALEBITS ; xmm6=YEL + psrld xmm4,SCALEBITS ; xmm4=YEH + packssdw xmm6,xmm4 ; xmm6=YE + + psllw xmm0,BYTE_BIT + por xmm6,xmm0 ; xmm6=Y + movdqa XMMWORD [edi], xmm6 ; Save Y + + pxor xmm2,xmm2 + pxor xmm4,xmm4 + punpcklwd xmm2,xmm3 ; xmm2=REL + punpckhwd xmm4,xmm3 ; xmm4=REH + psrld xmm2,1 ; xmm2=REL*FIX(0.500) + psrld xmm4,1 ; xmm4=REH*FIX(0.500) + + movdqa xmm0,[GOTOFF(eax,PD_ONEHALFM1_CJ)] ; xmm0=[PD_ONEHALFM1_CJ] + + paddd xmm1,xmm2 + paddd xmm5,xmm4 + paddd xmm1,xmm0 + paddd xmm5,xmm0 + psrld xmm1,SCALEBITS ; xmm1=CrEL + psrld xmm5,SCALEBITS ; xmm5=CrEH + packssdw xmm1,xmm5 ; xmm1=CrE + + psllw xmm7,BYTE_BIT + por xmm1,xmm7 ; xmm1=Cr + movdqa XMMWORD [edx], xmm1 ; Save Cr + + sub ecx, byte SIZEOF_XMMWORD + add esi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; inptr + add edi, byte SIZEOF_XMMWORD ; outptr0 + add ebx, byte SIZEOF_XMMWORD ; outptr1 + add edx, byte SIZEOF_XMMWORD ; outptr2 + cmp ecx, byte SIZEOF_XMMWORD + jae near .columnloop + test ecx,ecx + jnz near .column_ld1 + + pop ecx ; col + pop esi + pop edi + pop ebx + pop edx + poppic eax + + add esi, byte SIZEOF_JSAMPROW ; input_buf + add edi, byte SIZEOF_JSAMPROW + add ebx, byte SIZEOF_JSAMPROW + add edx, byte SIZEOF_JSAMPROW + dec eax ; num_rows + jg near .rowloop + +.return: + pop edi + pop esi +; pop edx ; need not be preserved +; pop ecx ; need not be preserved + pop ebx + mov esp,ebp ; esp <- aligned ebp + pop esp ; esp <- original ebp + pop ebp + ret + +; For some reason, the OS X linker does not honor the request to align the +; segment unless we do this. + align 16 diff --git a/media/libjpeg/simd/jccolor-altivec.c b/media/libjpeg/simd/jccolor-altivec.c new file mode 100644 index 000000000..ec473320e --- /dev/null +++ b/media/libjpeg/simd/jccolor-altivec.c @@ -0,0 +1,104 @@ +/* + * AltiVec optimizations for libjpeg-turbo + * + * Copyright (C) 2014, D. R. Commander. All Rights Reserved. + * + * This software is provided 'as-is', without any express or implied + * warranty. In no event will the authors be held liable for any damages + * arising from the use of this software. + * + * Permission is granted to anyone to use this software for any purpose, + * including commercial applications, and to alter it and redistribute it + * freely, subject to the following restrictions: + * + * 1. The origin of this software must not be misrepresented; you must not + * claim that you wrote the original software. If you use this software + * in a product, an acknowledgment in the product documentation would be + * appreciated but is not required. + * 2. Altered source versions must be plainly marked as such, and must not be + * misrepresented as being the original software. + * 3. This notice may not be removed or altered from any source distribution. + */ + +/* RGB --> YCC CONVERSION */ + +#include "jsimd_altivec.h" + + +#define F_0_081 5329 /* FIX(0.08131) */ +#define F_0_114 7471 /* FIX(0.11400) */ +#define F_0_168 11059 /* FIX(0.16874) */ +#define F_0_250 16384 /* FIX(0.25000) */ +#define F_0_299 19595 /* FIX(0.29900) */ +#define F_0_331 21709 /* FIX(0.33126) */ +#define F_0_418 27439 /* FIX(0.41869) */ +#define F_0_500 32768 /* FIX(0.50000) */ +#define F_0_587 38470 /* FIX(0.58700) */ +#define F_0_337 (F_0_587 - F_0_250) /* FIX(0.58700) - FIX(0.25000) */ + +#define SCALEBITS 16 +#define ONE_HALF (1 << (SCALEBITS - 1)) + + +#define RGBG_INDEX0 {0,1,3,4,6,7,9,10,2,1,5,4,8,7,11,10} +#define RGBG_INDEX1 {12,13,15,16,18,19,21,22,14,13,17,16,20,19,23,22} +#define RGBG_INDEX2 {8,9,11,12,14,15,17,18,10,9,13,12,16,15,19,18} +#define RGBG_INDEX3 {4,5,7,8,10,11,13,14,6,5,9,8,12,11,15,14} +#include "jccolext-altivec.c" +#undef RGB_PIXELSIZE + +#define RGB_PIXELSIZE EXT_RGB_PIXELSIZE +#define jsimd_rgb_ycc_convert_altivec jsimd_extrgb_ycc_convert_altivec +#include "jccolext-altivec.c" +#undef RGB_PIXELSIZE +#undef RGBG_INDEX0 +#undef RGBG_INDEX1 +#undef RGBG_INDEX2 +#undef RGBG_INDEX3 +#undef jsimd_rgb_ycc_convert_altivec + +#define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE +#define RGBG_INDEX {0,1,4,5,8,9,12,13,2,1,6,5,10,9,14,13} +#define jsimd_rgb_ycc_convert_altivec jsimd_extrgbx_ycc_convert_altivec +#include "jccolext-altivec.c" +#undef RGB_PIXELSIZE +#undef RGBG_INDEX +#undef jsimd_rgb_ycc_convert_altivec + +#define RGB_PIXELSIZE EXT_BGR_PIXELSIZE +#define RGBG_INDEX0 {2,1,5,4,8,7,11,10,0,1,3,4,6,7,9,10} +#define RGBG_INDEX1 {14,13,17,16,20,19,23,22,12,13,15,16,18,19,21,22} +#define RGBG_INDEX2 {10,9,13,12,16,15,19,18,8,9,11,12,14,15,17,18} +#define RGBG_INDEX3 {6,5,9,8,12,11,15,14,4,5,7,8,10,11,13,14} +#define jsimd_rgb_ycc_convert_altivec jsimd_extbgr_ycc_convert_altivec +#include "jccolext-altivec.c" +#undef RGB_PIXELSIZE +#undef RGBG_INDEX0 +#undef RGBG_INDEX1 +#undef RGBG_INDEX2 +#undef RGBG_INDEX3 +#undef jsimd_rgb_ycc_convert_altivec + +#define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE +#define RGBG_INDEX {2,1,6,5,10,9,14,13,0,1,4,5,8,9,12,13} +#define jsimd_rgb_ycc_convert_altivec jsimd_extbgrx_ycc_convert_altivec +#include "jccolext-altivec.c" +#undef RGB_PIXELSIZE +#undef RGBG_INDEX +#undef jsimd_rgb_ycc_convert_altivec + +#define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE +#define RGBG_INDEX {3,2,7,6,11,10,15,14,1,2,5,6,9,10,13,14} +#define jsimd_rgb_ycc_convert_altivec jsimd_extxbgr_ycc_convert_altivec +#include "jccolext-altivec.c" +#undef RGB_PIXELSIZE +#undef RGBG_INDEX +#undef jsimd_rgb_ycc_convert_altivec + +#define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE +#define RGBG_INDEX {1,2,5,6,9,10,13,14,3,2,7,6,11,10,15,14} +#define jsimd_rgb_ycc_convert_altivec jsimd_extxrgb_ycc_convert_altivec +#include "jccolext-altivec.c" +#undef RGB_PIXELSIZE +#undef RGBG_INDEX +#undef jsimd_rgb_ycc_convert_altivec diff --git a/media/libjpeg/simd/jccolor-mmx.asm b/media/libjpeg/simd/jccolor-mmx.asm new file mode 100644 index 000000000..c4e6d88be --- /dev/null +++ b/media/libjpeg/simd/jccolor-mmx.asm @@ -0,0 +1,122 @@ +; +; jccolor.asm - colorspace conversion (MMX) +; +; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB +; Copyright (C) 2009, D. R. Commander. +; +; Based on the x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler). +; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 +; +; [TAB8] + +%include "jsimdext.inc" + +; -------------------------------------------------------------------------- + +%define SCALEBITS 16 + +F_0_081 equ 5329 ; FIX(0.08131) +F_0_114 equ 7471 ; FIX(0.11400) +F_0_168 equ 11059 ; FIX(0.16874) +F_0_250 equ 16384 ; FIX(0.25000) +F_0_299 equ 19595 ; FIX(0.29900) +F_0_331 equ 21709 ; FIX(0.33126) +F_0_418 equ 27439 ; FIX(0.41869) +F_0_587 equ 38470 ; FIX(0.58700) +F_0_337 equ (F_0_587 - F_0_250) ; FIX(0.58700) - FIX(0.25000) + +; -------------------------------------------------------------------------- + SECTION SEG_CONST + + alignz 16 + global EXTN(jconst_rgb_ycc_convert_mmx) + +EXTN(jconst_rgb_ycc_convert_mmx): + +PW_F0299_F0337 times 2 dw F_0_299, F_0_337 +PW_F0114_F0250 times 2 dw F_0_114, F_0_250 +PW_MF016_MF033 times 2 dw -F_0_168,-F_0_331 +PW_MF008_MF041 times 2 dw -F_0_081,-F_0_418 +PD_ONEHALFM1_CJ times 2 dd (1 << (SCALEBITS-1)) - 1 + (CENTERJSAMPLE << SCALEBITS) +PD_ONEHALF times 2 dd (1 << (SCALEBITS-1)) + + alignz 16 + +; -------------------------------------------------------------------------- + SECTION SEG_TEXT + BITS 32 + +%include "jccolext-mmx.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_RGB_RED +%define RGB_GREEN EXT_RGB_GREEN +%define RGB_BLUE EXT_RGB_BLUE +%define RGB_PIXELSIZE EXT_RGB_PIXELSIZE +%define jsimd_rgb_ycc_convert_mmx jsimd_extrgb_ycc_convert_mmx +%include "jccolext-mmx.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_RGBX_RED +%define RGB_GREEN EXT_RGBX_GREEN +%define RGB_BLUE EXT_RGBX_BLUE +%define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE +%define jsimd_rgb_ycc_convert_mmx jsimd_extrgbx_ycc_convert_mmx +%include "jccolext-mmx.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_BGR_RED +%define RGB_GREEN EXT_BGR_GREEN +%define RGB_BLUE EXT_BGR_BLUE +%define RGB_PIXELSIZE EXT_BGR_PIXELSIZE +%define jsimd_rgb_ycc_convert_mmx jsimd_extbgr_ycc_convert_mmx +%include "jccolext-mmx.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_BGRX_RED +%define RGB_GREEN EXT_BGRX_GREEN +%define RGB_BLUE EXT_BGRX_BLUE +%define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE +%define jsimd_rgb_ycc_convert_mmx jsimd_extbgrx_ycc_convert_mmx +%include "jccolext-mmx.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_XBGR_RED +%define RGB_GREEN EXT_XBGR_GREEN +%define RGB_BLUE EXT_XBGR_BLUE +%define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE +%define jsimd_rgb_ycc_convert_mmx jsimd_extxbgr_ycc_convert_mmx +%include "jccolext-mmx.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_XRGB_RED +%define RGB_GREEN EXT_XRGB_GREEN +%define RGB_BLUE EXT_XRGB_BLUE +%define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE +%define jsimd_rgb_ycc_convert_mmx jsimd_extxrgb_ycc_convert_mmx +%include "jccolext-mmx.asm" diff --git a/media/libjpeg/simd/jccolor-sse2-64.asm b/media/libjpeg/simd/jccolor-sse2-64.asm new file mode 100644 index 000000000..bd2188b4c --- /dev/null +++ b/media/libjpeg/simd/jccolor-sse2-64.asm @@ -0,0 +1,121 @@ +; +; jccolor.asm - colorspace conversion (64-bit SSE2) +; +; Copyright (C) 2009, D. R. Commander. +; +; Based on the x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler). +; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 +; +; [TAB8] + +%include "jsimdext.inc" + +; -------------------------------------------------------------------------- + +%define SCALEBITS 16 + +F_0_081 equ 5329 ; FIX(0.08131) +F_0_114 equ 7471 ; FIX(0.11400) +F_0_168 equ 11059 ; FIX(0.16874) +F_0_250 equ 16384 ; FIX(0.25000) +F_0_299 equ 19595 ; FIX(0.29900) +F_0_331 equ 21709 ; FIX(0.33126) +F_0_418 equ 27439 ; FIX(0.41869) +F_0_587 equ 38470 ; FIX(0.58700) +F_0_337 equ (F_0_587 - F_0_250) ; FIX(0.58700) - FIX(0.25000) + +; -------------------------------------------------------------------------- + SECTION SEG_CONST + + alignz 16 + global EXTN(jconst_rgb_ycc_convert_sse2) + +EXTN(jconst_rgb_ycc_convert_sse2): + +PW_F0299_F0337 times 4 dw F_0_299, F_0_337 +PW_F0114_F0250 times 4 dw F_0_114, F_0_250 +PW_MF016_MF033 times 4 dw -F_0_168,-F_0_331 +PW_MF008_MF041 times 4 dw -F_0_081,-F_0_418 +PD_ONEHALFM1_CJ times 4 dd (1 << (SCALEBITS-1)) - 1 + (CENTERJSAMPLE << SCALEBITS) +PD_ONEHALF times 4 dd (1 << (SCALEBITS-1)) + + alignz 16 + +; -------------------------------------------------------------------------- + SECTION SEG_TEXT + BITS 64 + +%include "jccolext-sse2-64.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_RGB_RED +%define RGB_GREEN EXT_RGB_GREEN +%define RGB_BLUE EXT_RGB_BLUE +%define RGB_PIXELSIZE EXT_RGB_PIXELSIZE +%define jsimd_rgb_ycc_convert_sse2 jsimd_extrgb_ycc_convert_sse2 +%include "jccolext-sse2-64.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_RGBX_RED +%define RGB_GREEN EXT_RGBX_GREEN +%define RGB_BLUE EXT_RGBX_BLUE +%define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE +%define jsimd_rgb_ycc_convert_sse2 jsimd_extrgbx_ycc_convert_sse2 +%include "jccolext-sse2-64.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_BGR_RED +%define RGB_GREEN EXT_BGR_GREEN +%define RGB_BLUE EXT_BGR_BLUE +%define RGB_PIXELSIZE EXT_BGR_PIXELSIZE +%define jsimd_rgb_ycc_convert_sse2 jsimd_extbgr_ycc_convert_sse2 +%include "jccolext-sse2-64.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_BGRX_RED +%define RGB_GREEN EXT_BGRX_GREEN +%define RGB_BLUE EXT_BGRX_BLUE +%define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE +%define jsimd_rgb_ycc_convert_sse2 jsimd_extbgrx_ycc_convert_sse2 +%include "jccolext-sse2-64.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_XBGR_RED +%define RGB_GREEN EXT_XBGR_GREEN +%define RGB_BLUE EXT_XBGR_BLUE +%define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE +%define jsimd_rgb_ycc_convert_sse2 jsimd_extxbgr_ycc_convert_sse2 +%include "jccolext-sse2-64.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_XRGB_RED +%define RGB_GREEN EXT_XRGB_GREEN +%define RGB_BLUE EXT_XRGB_BLUE +%define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE +%define jsimd_rgb_ycc_convert_sse2 jsimd_extxrgb_ycc_convert_sse2 +%include "jccolext-sse2-64.asm" diff --git a/media/libjpeg/simd/jccolor-sse2.asm b/media/libjpeg/simd/jccolor-sse2.asm new file mode 100644 index 000000000..13124d13d --- /dev/null +++ b/media/libjpeg/simd/jccolor-sse2.asm @@ -0,0 +1,121 @@ +; +; jccolor.asm - colorspace conversion (SSE2) +; +; Copyright (C) 2009, D. R. Commander. +; +; Based on the x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler). +; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 +; +; [TAB8] + +%include "jsimdext.inc" + +; -------------------------------------------------------------------------- + +%define SCALEBITS 16 + +F_0_081 equ 5329 ; FIX(0.08131) +F_0_114 equ 7471 ; FIX(0.11400) +F_0_168 equ 11059 ; FIX(0.16874) +F_0_250 equ 16384 ; FIX(0.25000) +F_0_299 equ 19595 ; FIX(0.29900) +F_0_331 equ 21709 ; FIX(0.33126) +F_0_418 equ 27439 ; FIX(0.41869) +F_0_587 equ 38470 ; FIX(0.58700) +F_0_337 equ (F_0_587 - F_0_250) ; FIX(0.58700) - FIX(0.25000) + +; -------------------------------------------------------------------------- + SECTION SEG_CONST + + alignz 16 + global EXTN(jconst_rgb_ycc_convert_sse2) + +EXTN(jconst_rgb_ycc_convert_sse2): + +PW_F0299_F0337 times 4 dw F_0_299, F_0_337 +PW_F0114_F0250 times 4 dw F_0_114, F_0_250 +PW_MF016_MF033 times 4 dw -F_0_168,-F_0_331 +PW_MF008_MF041 times 4 dw -F_0_081,-F_0_418 +PD_ONEHALFM1_CJ times 4 dd (1 << (SCALEBITS-1)) - 1 + (CENTERJSAMPLE << SCALEBITS) +PD_ONEHALF times 4 dd (1 << (SCALEBITS-1)) + + alignz 16 + +; -------------------------------------------------------------------------- + SECTION SEG_TEXT + BITS 32 + +%include "jccolext-sse2.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_RGB_RED +%define RGB_GREEN EXT_RGB_GREEN +%define RGB_BLUE EXT_RGB_BLUE +%define RGB_PIXELSIZE EXT_RGB_PIXELSIZE +%define jsimd_rgb_ycc_convert_sse2 jsimd_extrgb_ycc_convert_sse2 +%include "jccolext-sse2.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_RGBX_RED +%define RGB_GREEN EXT_RGBX_GREEN +%define RGB_BLUE EXT_RGBX_BLUE +%define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE +%define jsimd_rgb_ycc_convert_sse2 jsimd_extrgbx_ycc_convert_sse2 +%include "jccolext-sse2.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_BGR_RED +%define RGB_GREEN EXT_BGR_GREEN +%define RGB_BLUE EXT_BGR_BLUE +%define RGB_PIXELSIZE EXT_BGR_PIXELSIZE +%define jsimd_rgb_ycc_convert_sse2 jsimd_extbgr_ycc_convert_sse2 +%include "jccolext-sse2.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_BGRX_RED +%define RGB_GREEN EXT_BGRX_GREEN +%define RGB_BLUE EXT_BGRX_BLUE +%define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE +%define jsimd_rgb_ycc_convert_sse2 jsimd_extbgrx_ycc_convert_sse2 +%include "jccolext-sse2.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_XBGR_RED +%define RGB_GREEN EXT_XBGR_GREEN +%define RGB_BLUE EXT_XBGR_BLUE +%define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE +%define jsimd_rgb_ycc_convert_sse2 jsimd_extxbgr_ycc_convert_sse2 +%include "jccolext-sse2.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_XRGB_RED +%define RGB_GREEN EXT_XRGB_GREEN +%define RGB_BLUE EXT_XRGB_BLUE +%define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE +%define jsimd_rgb_ycc_convert_sse2 jsimd_extxrgb_ycc_convert_sse2 +%include "jccolext-sse2.asm" diff --git a/media/libjpeg/simd/jcgray-altivec.c b/media/libjpeg/simd/jcgray-altivec.c new file mode 100644 index 000000000..684df5ef1 --- /dev/null +++ b/media/libjpeg/simd/jcgray-altivec.c @@ -0,0 +1,99 @@ +/* + * AltiVec optimizations for libjpeg-turbo + * + * Copyright (C) 2014, D. R. Commander. All Rights Reserved. + * + * This software is provided 'as-is', without any express or implied + * warranty. In no event will the authors be held liable for any damages + * arising from the use of this software. + * + * Permission is granted to anyone to use this software for any purpose, + * including commercial applications, and to alter it and redistribute it + * freely, subject to the following restrictions: + * + * 1. The origin of this software must not be misrepresented; you must not + * claim that you wrote the original software. If you use this software + * in a product, an acknowledgment in the product documentation would be + * appreciated but is not required. + * 2. Altered source versions must be plainly marked as such, and must not be + * misrepresented as being the original software. + * 3. This notice may not be removed or altered from any source distribution. + */ + +/* RGB --> GRAYSCALE CONVERSION */ + +#include "jsimd_altivec.h" + + +#define F_0_114 7471 /* FIX(0.11400) */ +#define F_0_250 16384 /* FIX(0.25000) */ +#define F_0_299 19595 /* FIX(0.29900) */ +#define F_0_587 38470 /* FIX(0.58700) */ +#define F_0_337 (F_0_587 - F_0_250) /* FIX(0.58700) - FIX(0.25000) */ + +#define SCALEBITS 16 +#define ONE_HALF (1 << (SCALEBITS - 1)) + + +#define RGBG_INDEX0 {0,1,3,4,6,7,9,10,2,1,5,4,8,7,11,10} +#define RGBG_INDEX1 {12,13,15,16,18,19,21,22,14,13,17,16,20,19,23,22} +#define RGBG_INDEX2 {8,9,11,12,14,15,17,18,10,9,13,12,16,15,19,18} +#define RGBG_INDEX3 {4,5,7,8,10,11,13,14,6,5,9,8,12,11,15,14} +#include "jcgryext-altivec.c" +#undef RGB_PIXELSIZE + +#define RGB_PIXELSIZE EXT_RGB_PIXELSIZE +#define jsimd_rgb_gray_convert_altivec jsimd_extrgb_gray_convert_altivec +#include "jcgryext-altivec.c" +#undef RGB_PIXELSIZE +#undef RGBG_INDEX0 +#undef RGBG_INDEX1 +#undef RGBG_INDEX2 +#undef RGBG_INDEX3 +#undef jsimd_rgb_gray_convert_altivec + +#define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE +#define RGBG_INDEX {0,1,4,5,8,9,12,13,2,1,6,5,10,9,14,13} +#define jsimd_rgb_gray_convert_altivec jsimd_extrgbx_gray_convert_altivec +#include "jcgryext-altivec.c" +#undef RGB_PIXELSIZE +#undef RGBG_INDEX +#undef jsimd_rgb_gray_convert_altivec + +#define RGB_PIXELSIZE EXT_BGR_PIXELSIZE +#define RGBG_INDEX0 {2,1,5,4,8,7,11,10,0,1,3,4,6,7,9,10} +#define RGBG_INDEX1 {14,13,17,16,20,19,23,22,12,13,15,16,18,19,21,22} +#define RGBG_INDEX2 {10,9,13,12,16,15,19,18,8,9,11,12,14,15,17,18} +#define RGBG_INDEX3 {6,5,9,8,12,11,15,14,4,5,7,8,10,11,13,14} +#define jsimd_rgb_gray_convert_altivec jsimd_extbgr_gray_convert_altivec +#include "jcgryext-altivec.c" +#undef RGB_PIXELSIZE +#undef RGBG_INDEX0 +#undef RGBG_INDEX1 +#undef RGBG_INDEX2 +#undef RGBG_INDEX3 +#undef jsimd_rgb_gray_convert_altivec + +#define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE +#define RGBG_INDEX {2,1,6,5,10,9,14,13,0,1,4,5,8,9,12,13} +#define jsimd_rgb_gray_convert_altivec jsimd_extbgrx_gray_convert_altivec +#include "jcgryext-altivec.c" +#undef RGB_PIXELSIZE +#undef RGBG_INDEX +#undef jsimd_rgb_gray_convert_altivec + +#define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE +#define RGBG_INDEX {3,2,7,6,11,10,15,14,1,2,5,6,9,10,13,14} +#define jsimd_rgb_gray_convert_altivec jsimd_extxbgr_gray_convert_altivec +#include "jcgryext-altivec.c" +#undef RGB_PIXELSIZE +#undef RGBG_INDEX +#undef jsimd_rgb_gray_convert_altivec + +#define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE +#define RGBG_INDEX {1,2,5,6,9,10,13,14,3,2,7,6,11,10,15,14} +#define jsimd_rgb_gray_convert_altivec jsimd_extxrgb_gray_convert_altivec +#include "jcgryext-altivec.c" +#undef RGB_PIXELSIZE +#undef RGBG_INDEX +#undef jsimd_rgb_gray_convert_altivec diff --git a/media/libjpeg/simd/jcgray-mmx.asm b/media/libjpeg/simd/jcgray-mmx.asm new file mode 100644 index 000000000..0819b6ca0 --- /dev/null +++ b/media/libjpeg/simd/jcgray-mmx.asm @@ -0,0 +1,115 @@ +; +; jcgray.asm - grayscale colorspace conversion (MMX) +; +; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB +; Copyright (C) 2011, D. R. Commander. +; +; Based on the x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler). +; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 +; +; [TAB8] + +%include "jsimdext.inc" + +; -------------------------------------------------------------------------- + +%define SCALEBITS 16 + +F_0_114 equ 7471 ; FIX(0.11400) +F_0_250 equ 16384 ; FIX(0.25000) +F_0_299 equ 19595 ; FIX(0.29900) +F_0_587 equ 38470 ; FIX(0.58700) +F_0_337 equ (F_0_587 - F_0_250) ; FIX(0.58700) - FIX(0.25000) + +; -------------------------------------------------------------------------- + SECTION SEG_CONST + + alignz 16 + global EXTN(jconst_rgb_gray_convert_mmx) + +EXTN(jconst_rgb_gray_convert_mmx): + +PW_F0299_F0337 times 2 dw F_0_299, F_0_337 +PW_F0114_F0250 times 2 dw F_0_114, F_0_250 +PD_ONEHALF times 2 dd (1 << (SCALEBITS-1)) + + alignz 16 + +; -------------------------------------------------------------------------- + SECTION SEG_TEXT + BITS 32 + +%include "jcgryext-mmx.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_RGB_RED +%define RGB_GREEN EXT_RGB_GREEN +%define RGB_BLUE EXT_RGB_BLUE +%define RGB_PIXELSIZE EXT_RGB_PIXELSIZE +%define jsimd_rgb_gray_convert_mmx jsimd_extrgb_gray_convert_mmx +%include "jcgryext-mmx.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_RGBX_RED +%define RGB_GREEN EXT_RGBX_GREEN +%define RGB_BLUE EXT_RGBX_BLUE +%define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE +%define jsimd_rgb_gray_convert_mmx jsimd_extrgbx_gray_convert_mmx +%include "jcgryext-mmx.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_BGR_RED +%define RGB_GREEN EXT_BGR_GREEN +%define RGB_BLUE EXT_BGR_BLUE +%define RGB_PIXELSIZE EXT_BGR_PIXELSIZE +%define jsimd_rgb_gray_convert_mmx jsimd_extbgr_gray_convert_mmx +%include "jcgryext-mmx.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_BGRX_RED +%define RGB_GREEN EXT_BGRX_GREEN +%define RGB_BLUE EXT_BGRX_BLUE +%define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE +%define jsimd_rgb_gray_convert_mmx jsimd_extbgrx_gray_convert_mmx +%include "jcgryext-mmx.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_XBGR_RED +%define RGB_GREEN EXT_XBGR_GREEN +%define RGB_BLUE EXT_XBGR_BLUE +%define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE +%define jsimd_rgb_gray_convert_mmx jsimd_extxbgr_gray_convert_mmx +%include "jcgryext-mmx.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_XRGB_RED +%define RGB_GREEN EXT_XRGB_GREEN +%define RGB_BLUE EXT_XRGB_BLUE +%define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE +%define jsimd_rgb_gray_convert_mmx jsimd_extxrgb_gray_convert_mmx +%include "jcgryext-mmx.asm" diff --git a/media/libjpeg/simd/jcgray-sse2-64.asm b/media/libjpeg/simd/jcgray-sse2-64.asm new file mode 100644 index 000000000..bafd302aa --- /dev/null +++ b/media/libjpeg/simd/jcgray-sse2-64.asm @@ -0,0 +1,114 @@ +; +; jcgray.asm - grayscale colorspace conversion (64-bit SSE2) +; +; Copyright (C) 2011, D. R. Commander. +; +; Based on the x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler). +; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 +; +; [TAB8] + +%include "jsimdext.inc" + +; -------------------------------------------------------------------------- + +%define SCALEBITS 16 + +F_0_114 equ 7471 ; FIX(0.11400) +F_0_250 equ 16384 ; FIX(0.25000) +F_0_299 equ 19595 ; FIX(0.29900) +F_0_587 equ 38470 ; FIX(0.58700) +F_0_337 equ (F_0_587 - F_0_250) ; FIX(0.58700) - FIX(0.25000) + +; -------------------------------------------------------------------------- + SECTION SEG_CONST + + alignz 16 + global EXTN(jconst_rgb_gray_convert_sse2) + +EXTN(jconst_rgb_gray_convert_sse2): + +PW_F0299_F0337 times 4 dw F_0_299, F_0_337 +PW_F0114_F0250 times 4 dw F_0_114, F_0_250 +PD_ONEHALF times 4 dd (1 << (SCALEBITS-1)) + + alignz 16 + +; -------------------------------------------------------------------------- + SECTION SEG_TEXT + BITS 64 + +%include "jcgryext-sse2-64.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_RGB_RED +%define RGB_GREEN EXT_RGB_GREEN +%define RGB_BLUE EXT_RGB_BLUE +%define RGB_PIXELSIZE EXT_RGB_PIXELSIZE +%define jsimd_rgb_gray_convert_sse2 jsimd_extrgb_gray_convert_sse2 +%include "jcgryext-sse2-64.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_RGBX_RED +%define RGB_GREEN EXT_RGBX_GREEN +%define RGB_BLUE EXT_RGBX_BLUE +%define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE +%define jsimd_rgb_gray_convert_sse2 jsimd_extrgbx_gray_convert_sse2 +%include "jcgryext-sse2-64.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_BGR_RED +%define RGB_GREEN EXT_BGR_GREEN +%define RGB_BLUE EXT_BGR_BLUE +%define RGB_PIXELSIZE EXT_BGR_PIXELSIZE +%define jsimd_rgb_gray_convert_sse2 jsimd_extbgr_gray_convert_sse2 +%include "jcgryext-sse2-64.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_BGRX_RED +%define RGB_GREEN EXT_BGRX_GREEN +%define RGB_BLUE EXT_BGRX_BLUE +%define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE +%define jsimd_rgb_gray_convert_sse2 jsimd_extbgrx_gray_convert_sse2 +%include "jcgryext-sse2-64.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_XBGR_RED +%define RGB_GREEN EXT_XBGR_GREEN +%define RGB_BLUE EXT_XBGR_BLUE +%define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE +%define jsimd_rgb_gray_convert_sse2 jsimd_extxbgr_gray_convert_sse2 +%include "jcgryext-sse2-64.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_XRGB_RED +%define RGB_GREEN EXT_XRGB_GREEN +%define RGB_BLUE EXT_XRGB_BLUE +%define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE +%define jsimd_rgb_gray_convert_sse2 jsimd_extxrgb_gray_convert_sse2 +%include "jcgryext-sse2-64.asm" diff --git a/media/libjpeg/simd/jcgray-sse2.asm b/media/libjpeg/simd/jcgray-sse2.asm new file mode 100644 index 000000000..5b0b46695 --- /dev/null +++ b/media/libjpeg/simd/jcgray-sse2.asm @@ -0,0 +1,114 @@ +; +; jcgray.asm - grayscale colorspace conversion (SSE2) +; +; Copyright (C) 2011, D. R. Commander. +; +; Based on the x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler). +; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 +; +; [TAB8] + +%include "jsimdext.inc" + +; -------------------------------------------------------------------------- + +%define SCALEBITS 16 + +F_0_114 equ 7471 ; FIX(0.11400) +F_0_250 equ 16384 ; FIX(0.25000) +F_0_299 equ 19595 ; FIX(0.29900) +F_0_587 equ 38470 ; FIX(0.58700) +F_0_337 equ (F_0_587 - F_0_250) ; FIX(0.58700) - FIX(0.25000) + +; -------------------------------------------------------------------------- + SECTION SEG_CONST + + alignz 16 + global EXTN(jconst_rgb_gray_convert_sse2) + +EXTN(jconst_rgb_gray_convert_sse2): + +PW_F0299_F0337 times 4 dw F_0_299, F_0_337 +PW_F0114_F0250 times 4 dw F_0_114, F_0_250 +PD_ONEHALF times 4 dd (1 << (SCALEBITS-1)) + + alignz 16 + +; -------------------------------------------------------------------------- + SECTION SEG_TEXT + BITS 32 + +%include "jcgryext-sse2.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_RGB_RED +%define RGB_GREEN EXT_RGB_GREEN +%define RGB_BLUE EXT_RGB_BLUE +%define RGB_PIXELSIZE EXT_RGB_PIXELSIZE +%define jsimd_rgb_gray_convert_sse2 jsimd_extrgb_gray_convert_sse2 +%include "jcgryext-sse2.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_RGBX_RED +%define RGB_GREEN EXT_RGBX_GREEN +%define RGB_BLUE EXT_RGBX_BLUE +%define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE +%define jsimd_rgb_gray_convert_sse2 jsimd_extrgbx_gray_convert_sse2 +%include "jcgryext-sse2.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_BGR_RED +%define RGB_GREEN EXT_BGR_GREEN +%define RGB_BLUE EXT_BGR_BLUE +%define RGB_PIXELSIZE EXT_BGR_PIXELSIZE +%define jsimd_rgb_gray_convert_sse2 jsimd_extbgr_gray_convert_sse2 +%include "jcgryext-sse2.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_BGRX_RED +%define RGB_GREEN EXT_BGRX_GREEN +%define RGB_BLUE EXT_BGRX_BLUE +%define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE +%define jsimd_rgb_gray_convert_sse2 jsimd_extbgrx_gray_convert_sse2 +%include "jcgryext-sse2.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_XBGR_RED +%define RGB_GREEN EXT_XBGR_GREEN +%define RGB_BLUE EXT_XBGR_BLUE +%define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE +%define jsimd_rgb_gray_convert_sse2 jsimd_extxbgr_gray_convert_sse2 +%include "jcgryext-sse2.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_XRGB_RED +%define RGB_GREEN EXT_XRGB_GREEN +%define RGB_BLUE EXT_XRGB_BLUE +%define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE +%define jsimd_rgb_gray_convert_sse2 jsimd_extxrgb_gray_convert_sse2 +%include "jcgryext-sse2.asm" diff --git a/media/libjpeg/simd/jcgryext-altivec.c b/media/libjpeg/simd/jcgryext-altivec.c new file mode 100644 index 000000000..7f8232bb2 --- /dev/null +++ b/media/libjpeg/simd/jcgryext-altivec.c @@ -0,0 +1,227 @@ +/* + * AltiVec optimizations for libjpeg-turbo + * + * Copyright (C) 2014-2015, D. R. Commander. All Rights Reserved. + * Copyright (C) 2014, Jay Foad. All Rights Reserved. + * + * This software is provided 'as-is', without any express or implied + * warranty. In no event will the authors be held liable for any damages + * arising from the use of this software. + * + * Permission is granted to anyone to use this software for any purpose, + * including commercial applications, and to alter it and redistribute it + * freely, subject to the following restrictions: + * + * 1. The origin of this software must not be misrepresented; you must not + * claim that you wrote the original software. If you use this software + * in a product, an acknowledgment in the product documentation would be + * appreciated but is not required. + * 2. Altered source versions must be plainly marked as such, and must not be + * misrepresented as being the original software. + * 3. This notice may not be removed or altered from any source distribution. + */ + +/* This file is included by jcgray-altivec.c */ + + +void jsimd_rgb_gray_convert_altivec (JDIMENSION img_width, + JSAMPARRAY input_buf, + JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows) +{ + JSAMPROW inptr, outptr; + int pitch = img_width * RGB_PIXELSIZE, num_cols; +#if __BIG_ENDIAN__ + int offset; + unsigned char __attribute__((aligned(16))) tmpbuf[RGB_PIXELSIZE * 16]; +#endif + + __vector unsigned char rgb0, rgb1 = {0}, rgb2 = {0}, + rgbg0, rgbg1, rgbg2, rgbg3, y; +#if __BIG_ENDIAN__ || RGB_PIXELSIZE == 4 + __vector unsigned char rgb3 = {0}; +#endif +#if __BIG_ENDIAN__ && RGB_PIXELSIZE == 4 + __vector unsigned char rgb4 = {0}; +#endif + __vector short rg0, rg1, rg2, rg3, bg0, bg1, bg2, bg3; + __vector unsigned short yl, yh; + __vector int y0, y1, y2, y3; + + /* Constants */ + __vector short pw_f0299_f0337 = { __4X2(F_0_299, F_0_337) }, + pw_f0114_f0250 = { __4X2(F_0_114, F_0_250) }; + __vector int pd_onehalf = { __4X(ONE_HALF) }; + __vector unsigned char pb_zero = { __16X(0) }, +#if __BIG_ENDIAN__ + shift_pack_index = {0,1,4,5,8,9,12,13,16,17,20,21,24,25,28,29}; +#else + shift_pack_index = {2,3,6,7,10,11,14,15,18,19,22,23,26,27,30,31}; +#endif + + while (--num_rows >= 0) { + inptr = *input_buf++; + outptr = output_buf[0][output_row]; + output_row++; + + for (num_cols = pitch; num_cols > 0; + num_cols -= RGB_PIXELSIZE * 16, inptr += RGB_PIXELSIZE * 16, + outptr += 16) { + +#if __BIG_ENDIAN__ + /* Load 16 pixels == 48 or 64 bytes */ + offset = (size_t)inptr & 15; + if (offset) { + __vector unsigned char unaligned_shift_index; + int bytes = num_cols + offset; + + if (bytes < (RGB_PIXELSIZE + 1) * 16 && (bytes & 15)) { + /* Slow path to prevent buffer overread. Since there is no way to + * read a partial AltiVec register, overread would occur on the last + * chunk of the last image row if the right edge is not on a 16-byte + * boundary. It could also occur on other rows if the bytes per row + * is low enough. Since we can't determine whether we're on the last + * image row, we have to assume every row is the last. + */ + memcpy(tmpbuf, inptr, min(num_cols, RGB_PIXELSIZE * 16)); + rgb0 = vec_ld(0, tmpbuf); + rgb1 = vec_ld(16, tmpbuf); + rgb2 = vec_ld(32, tmpbuf); +#if RGB_PIXELSIZE == 4 + rgb3 = vec_ld(48, tmpbuf); +#endif + } else { + /* Fast path */ + rgb0 = vec_ld(0, inptr); + if (bytes > 16) + rgb1 = vec_ld(16, inptr); + if (bytes > 32) + rgb2 = vec_ld(32, inptr); + if (bytes > 48) + rgb3 = vec_ld(48, inptr); +#if RGB_PIXELSIZE == 4 + if (bytes > 64) + rgb4 = vec_ld(64, inptr); +#endif + unaligned_shift_index = vec_lvsl(0, inptr); + rgb0 = vec_perm(rgb0, rgb1, unaligned_shift_index); + rgb1 = vec_perm(rgb1, rgb2, unaligned_shift_index); + rgb2 = vec_perm(rgb2, rgb3, unaligned_shift_index); +#if RGB_PIXELSIZE == 4 + rgb3 = vec_perm(rgb3, rgb4, unaligned_shift_index); +#endif + } + } else { + if (num_cols < RGB_PIXELSIZE * 16 && (num_cols & 15)) { + /* Slow path */ + memcpy(tmpbuf, inptr, min(num_cols, RGB_PIXELSIZE * 16)); + rgb0 = vec_ld(0, tmpbuf); + rgb1 = vec_ld(16, tmpbuf); + rgb2 = vec_ld(32, tmpbuf); +#if RGB_PIXELSIZE == 4 + rgb3 = vec_ld(48, tmpbuf); +#endif + } else { + /* Fast path */ + rgb0 = vec_ld(0, inptr); + if (num_cols > 16) + rgb1 = vec_ld(16, inptr); + if (num_cols > 32) + rgb2 = vec_ld(32, inptr); +#if RGB_PIXELSIZE == 4 + if (num_cols > 48) + rgb3 = vec_ld(48, inptr); +#endif + } + } +#else + /* Little endian */ + rgb0 = vec_vsx_ld(0, inptr); + if (num_cols > 16) + rgb1 = vec_vsx_ld(16, inptr); + if (num_cols > 32) + rgb2 = vec_vsx_ld(32, inptr); +#if RGB_PIXELSIZE == 4 + if (num_cols > 48) + rgb3 = vec_vsx_ld(48, inptr); +#endif +#endif + +#if RGB_PIXELSIZE == 3 + /* rgb0 = R0 G0 B0 R1 G1 B1 R2 G2 B2 R3 G3 B3 R4 G4 B4 R5 + * rgb1 = G5 B5 R6 G6 B6 R7 G7 B7 R8 G8 B8 R9 G9 B9 Ra Ga + * rgb2 = Ba Rb Gb Bb Rc Gc Bc Rd Gd Bd Re Ge Be Rf Gf Bf + * + * rgbg0 = R0 G0 R1 G1 R2 G2 R3 G3 B0 G0 B1 G1 B2 G2 B3 G3 + * rgbg1 = R4 G4 R5 G5 R6 G6 R7 G7 B4 G4 B5 G5 B6 G6 B7 G7 + * rgbg2 = R8 G8 R9 G9 Ra Ga Rb Gb B8 G8 B9 G9 Ba Ga Bb Gb + * rgbg3 = Rc Gc Rd Gd Re Ge Rf Gf Bc Gc Bd Gd Be Ge Bf Gf + */ + rgbg0 = vec_perm(rgb0, rgb0, (__vector unsigned char)RGBG_INDEX0); + rgbg1 = vec_perm(rgb0, rgb1, (__vector unsigned char)RGBG_INDEX1); + rgbg2 = vec_perm(rgb1, rgb2, (__vector unsigned char)RGBG_INDEX2); + rgbg3 = vec_perm(rgb2, rgb2, (__vector unsigned char)RGBG_INDEX3); +#else + /* rgb0 = R0 G0 B0 X0 R1 G1 B1 X1 R2 G2 B2 X2 R3 G3 B3 X3 + * rgb1 = R4 G4 B4 X4 R5 G5 B5 X5 R6 G6 B6 X6 R7 G7 B7 X7 + * rgb2 = R8 G8 B8 X8 R9 G9 B9 X9 Ra Ga Ba Xa Rb Gb Bb Xb + * rgb3 = Rc Gc Bc Xc Rd Gd Bd Xd Re Ge Be Xe Rf Gf Bf Xf + * + * rgbg0 = R0 G0 R1 G1 R2 G2 R3 G3 B0 G0 B1 G1 B2 G2 B3 G3 + * rgbg1 = R4 G4 R5 G5 R6 G6 R7 G7 B4 G4 B5 G5 B6 G6 B7 G7 + * rgbg2 = R8 G8 R9 G9 Ra Ga Rb Gb B8 G8 B9 G9 Ba Ga Bb Gb + * rgbg3 = Rc Gc Rd Gd Re Ge Rf Gf Bc Gc Bd Gd Be Ge Bf Gf + */ + rgbg0 = vec_perm(rgb0, rgb0, (__vector unsigned char)RGBG_INDEX); + rgbg1 = vec_perm(rgb1, rgb1, (__vector unsigned char)RGBG_INDEX); + rgbg2 = vec_perm(rgb2, rgb2, (__vector unsigned char)RGBG_INDEX); + rgbg3 = vec_perm(rgb3, rgb3, (__vector unsigned char)RGBG_INDEX); +#endif + + /* rg0 = R0 G0 R1 G1 R2 G2 R3 G3 + * bg0 = B0 G0 B1 G1 B2 G2 B3 G3 + * ... + * + * NOTE: We have to use vec_merge*() here because vec_unpack*() doesn't + * support unsigned vectors. + */ + rg0 = (__vector signed short)VEC_UNPACKHU(rgbg0); + bg0 = (__vector signed short)VEC_UNPACKLU(rgbg0); + rg1 = (__vector signed short)VEC_UNPACKHU(rgbg1); + bg1 = (__vector signed short)VEC_UNPACKLU(rgbg1); + rg2 = (__vector signed short)VEC_UNPACKHU(rgbg2); + bg2 = (__vector signed short)VEC_UNPACKLU(rgbg2); + rg3 = (__vector signed short)VEC_UNPACKHU(rgbg3); + bg3 = (__vector signed short)VEC_UNPACKLU(rgbg3); + + /* (Original) + * Y = 0.29900 * R + 0.58700 * G + 0.11400 * B + * + * (This implementation) + * Y = 0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G + */ + + /* Calculate Y values */ + + y0 = vec_msums(rg0, pw_f0299_f0337, pd_onehalf); + y1 = vec_msums(rg1, pw_f0299_f0337, pd_onehalf); + y2 = vec_msums(rg2, pw_f0299_f0337, pd_onehalf); + y3 = vec_msums(rg3, pw_f0299_f0337, pd_onehalf); + y0 = vec_msums(bg0, pw_f0114_f0250, y0); + y1 = vec_msums(bg1, pw_f0114_f0250, y1); + y2 = vec_msums(bg2, pw_f0114_f0250, y2); + y3 = vec_msums(bg3, pw_f0114_f0250, y3); + /* Clever way to avoid 4 shifts + 2 packs. This packs the high word from + * each dword into a new 16-bit vector, which is the equivalent of + * descaling the 32-bit results (right-shifting by 16 bits) and then + * packing them. + */ + yl = vec_perm((__vector unsigned short)y0, (__vector unsigned short)y1, + shift_pack_index); + yh = vec_perm((__vector unsigned short)y2, (__vector unsigned short)y3, + shift_pack_index); + y = vec_pack(yl, yh); + vec_st(y, 0, outptr); + } + } +} diff --git a/media/libjpeg/simd/jcgryext-mmx.asm b/media/libjpeg/simd/jcgryext-mmx.asm new file mode 100644 index 000000000..1c1b8d8bc --- /dev/null +++ b/media/libjpeg/simd/jcgryext-mmx.asm @@ -0,0 +1,356 @@ +; +; jcgryext.asm - grayscale colorspace conversion (MMX) +; +; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB +; Copyright (C) 2011, D. R. Commander. +; +; Based on the x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler). +; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 +; +; [TAB8] + +%include "jcolsamp.inc" + +; -------------------------------------------------------------------------- +; +; Convert some rows of samples to the output colorspace. +; +; GLOBAL(void) +; jsimd_rgb_gray_convert_mmx (JDIMENSION img_width, +; JSAMPARRAY input_buf, JSAMPIMAGE output_buf, +; JDIMENSION output_row, int num_rows); +; + +%define img_width(b) (b)+8 ; JDIMENSION img_width +%define input_buf(b) (b)+12 ; JSAMPARRAY input_buf +%define output_buf(b) (b)+16 ; JSAMPIMAGE output_buf +%define output_row(b) (b)+20 ; JDIMENSION output_row +%define num_rows(b) (b)+24 ; int num_rows + +%define original_ebp ebp+0 +%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_MMWORD ; mmword wk[WK_NUM] +%define WK_NUM 2 +%define gotptr wk(0)-SIZEOF_POINTER ; void * gotptr + + align 16 + global EXTN(jsimd_rgb_gray_convert_mmx) + +EXTN(jsimd_rgb_gray_convert_mmx): + push ebp + mov eax,esp ; eax = original ebp + sub esp, byte 4 + and esp, byte (-SIZEOF_MMWORD) ; align to 64 bits + mov [esp],eax + mov ebp,esp ; ebp = aligned ebp + lea esp, [wk(0)] + pushpic eax ; make a room for GOT address + push ebx +; push ecx ; need not be preserved +; push edx ; need not be preserved + push esi + push edi + + get_GOT ebx ; get GOT address + movpic POINTER [gotptr], ebx ; save GOT address + + mov ecx, JDIMENSION [img_width(eax)] ; num_cols + test ecx,ecx + jz near .return + + push ecx + + mov esi, JSAMPIMAGE [output_buf(eax)] + mov ecx, JDIMENSION [output_row(eax)] + mov edi, JSAMPARRAY [esi+0*SIZEOF_JSAMPARRAY] + lea edi, [edi+ecx*SIZEOF_JSAMPROW] + + pop ecx + + mov esi, JSAMPARRAY [input_buf(eax)] + mov eax, INT [num_rows(eax)] + test eax,eax + jle near .return + alignx 16,7 +.rowloop: + pushpic eax + push edi + push esi + push ecx ; col + + mov esi, JSAMPROW [esi] ; inptr + mov edi, JSAMPROW [edi] ; outptr0 + movpic eax, POINTER [gotptr] ; load GOT address (eax) + + cmp ecx, byte SIZEOF_MMWORD + jae short .columnloop + alignx 16,7 + +%if RGB_PIXELSIZE == 3 ; --------------- + +.column_ld1: + push eax + push edx + lea ecx,[ecx+ecx*2] ; imul ecx,RGB_PIXELSIZE + test cl, SIZEOF_BYTE + jz short .column_ld2 + sub ecx, byte SIZEOF_BYTE + xor eax,eax + mov al, BYTE [esi+ecx] +.column_ld2: + test cl, SIZEOF_WORD + jz short .column_ld4 + sub ecx, byte SIZEOF_WORD + xor edx,edx + mov dx, WORD [esi+ecx] + shl eax, WORD_BIT + or eax,edx +.column_ld4: + movd mmA,eax + pop edx + pop eax + test cl, SIZEOF_DWORD + jz short .column_ld8 + sub ecx, byte SIZEOF_DWORD + movd mmG, DWORD [esi+ecx] + psllq mmA, DWORD_BIT + por mmA,mmG +.column_ld8: + test cl, SIZEOF_MMWORD + jz short .column_ld16 + movq mmG,mmA + movq mmA, MMWORD [esi+0*SIZEOF_MMWORD] + mov ecx, SIZEOF_MMWORD + jmp short .rgb_gray_cnv +.column_ld16: + test cl, 2*SIZEOF_MMWORD + mov ecx, SIZEOF_MMWORD + jz short .rgb_gray_cnv + movq mmF,mmA + movq mmA, MMWORD [esi+0*SIZEOF_MMWORD] + movq mmG, MMWORD [esi+1*SIZEOF_MMWORD] + jmp short .rgb_gray_cnv + alignx 16,7 + +.columnloop: + movq mmA, MMWORD [esi+0*SIZEOF_MMWORD] + movq mmG, MMWORD [esi+1*SIZEOF_MMWORD] + movq mmF, MMWORD [esi+2*SIZEOF_MMWORD] + +.rgb_gray_cnv: + ; mmA=(00 10 20 01 11 21 02 12) + ; mmG=(22 03 13 23 04 14 24 05) + ; mmF=(15 25 06 16 26 07 17 27) + + movq mmD,mmA + psllq mmA,4*BYTE_BIT ; mmA=(-- -- -- -- 00 10 20 01) + psrlq mmD,4*BYTE_BIT ; mmD=(11 21 02 12 -- -- -- --) + + punpckhbw mmA,mmG ; mmA=(00 04 10 14 20 24 01 05) + psllq mmG,4*BYTE_BIT ; mmG=(-- -- -- -- 22 03 13 23) + + punpcklbw mmD,mmF ; mmD=(11 15 21 25 02 06 12 16) + punpckhbw mmG,mmF ; mmG=(22 26 03 07 13 17 23 27) + + movq mmE,mmA + psllq mmA,4*BYTE_BIT ; mmA=(-- -- -- -- 00 04 10 14) + psrlq mmE,4*BYTE_BIT ; mmE=(20 24 01 05 -- -- -- --) + + punpckhbw mmA,mmD ; mmA=(00 02 04 06 10 12 14 16) + psllq mmD,4*BYTE_BIT ; mmD=(-- -- -- -- 11 15 21 25) + + punpcklbw mmE,mmG ; mmE=(20 22 24 26 01 03 05 07) + punpckhbw mmD,mmG ; mmD=(11 13 15 17 21 23 25 27) + + pxor mmH,mmH + + movq mmC,mmA + punpcklbw mmA,mmH ; mmA=(00 02 04 06) + punpckhbw mmC,mmH ; mmC=(10 12 14 16) + + movq mmB,mmE + punpcklbw mmE,mmH ; mmE=(20 22 24 26) + punpckhbw mmB,mmH ; mmB=(01 03 05 07) + + movq mmF,mmD + punpcklbw mmD,mmH ; mmD=(11 13 15 17) + punpckhbw mmF,mmH ; mmF=(21 23 25 27) + +%else ; RGB_PIXELSIZE == 4 ; ----------- + +.column_ld1: + test cl, SIZEOF_MMWORD/8 + jz short .column_ld2 + sub ecx, byte SIZEOF_MMWORD/8 + movd mmA, DWORD [esi+ecx*RGB_PIXELSIZE] +.column_ld2: + test cl, SIZEOF_MMWORD/4 + jz short .column_ld4 + sub ecx, byte SIZEOF_MMWORD/4 + movq mmF,mmA + movq mmA, MMWORD [esi+ecx*RGB_PIXELSIZE] +.column_ld4: + test cl, SIZEOF_MMWORD/2 + mov ecx, SIZEOF_MMWORD + jz short .rgb_gray_cnv + movq mmD,mmA + movq mmC,mmF + movq mmA, MMWORD [esi+0*SIZEOF_MMWORD] + movq mmF, MMWORD [esi+1*SIZEOF_MMWORD] + jmp short .rgb_gray_cnv + alignx 16,7 + +.columnloop: + movq mmA, MMWORD [esi+0*SIZEOF_MMWORD] + movq mmF, MMWORD [esi+1*SIZEOF_MMWORD] + movq mmD, MMWORD [esi+2*SIZEOF_MMWORD] + movq mmC, MMWORD [esi+3*SIZEOF_MMWORD] + +.rgb_gray_cnv: + ; mmA=(00 10 20 30 01 11 21 31) + ; mmF=(02 12 22 32 03 13 23 33) + ; mmD=(04 14 24 34 05 15 25 35) + ; mmC=(06 16 26 36 07 17 27 37) + + movq mmB,mmA + punpcklbw mmA,mmF ; mmA=(00 02 10 12 20 22 30 32) + punpckhbw mmB,mmF ; mmB=(01 03 11 13 21 23 31 33) + + movq mmG,mmD + punpcklbw mmD,mmC ; mmD=(04 06 14 16 24 26 34 36) + punpckhbw mmG,mmC ; mmG=(05 07 15 17 25 27 35 37) + + movq mmE,mmA + punpcklwd mmA,mmD ; mmA=(00 02 04 06 10 12 14 16) + punpckhwd mmE,mmD ; mmE=(20 22 24 26 30 32 34 36) + + movq mmH,mmB + punpcklwd mmB,mmG ; mmB=(01 03 05 07 11 13 15 17) + punpckhwd mmH,mmG ; mmH=(21 23 25 27 31 33 35 37) + + pxor mmF,mmF + + movq mmC,mmA + punpcklbw mmA,mmF ; mmA=(00 02 04 06) + punpckhbw mmC,mmF ; mmC=(10 12 14 16) + + movq mmD,mmB + punpcklbw mmB,mmF ; mmB=(01 03 05 07) + punpckhbw mmD,mmF ; mmD=(11 13 15 17) + + movq mmG,mmE + punpcklbw mmE,mmF ; mmE=(20 22 24 26) + punpckhbw mmG,mmF ; mmG=(30 32 34 36) + + punpcklbw mmF,mmH + punpckhbw mmH,mmH + psrlw mmF,BYTE_BIT ; mmF=(21 23 25 27) + psrlw mmH,BYTE_BIT ; mmH=(31 33 35 37) + +%endif ; RGB_PIXELSIZE ; --------------- + + ; mm0=(R0 R2 R4 R6)=RE, mm2=(G0 G2 G4 G6)=GE, mm4=(B0 B2 B4 B6)=BE + ; mm1=(R1 R3 R5 R7)=RO, mm3=(G1 G3 G5 G7)=GO, mm5=(B1 B3 B5 B7)=BO + + ; (Original) + ; Y = 0.29900 * R + 0.58700 * G + 0.11400 * B + ; + ; (This implementation) + ; Y = 0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G + + movq mm6,mm1 + punpcklwd mm1,mm3 + punpckhwd mm6,mm3 + pmaddwd mm1,[GOTOFF(eax,PW_F0299_F0337)] ; mm1=ROL*FIX(0.299)+GOL*FIX(0.337) + pmaddwd mm6,[GOTOFF(eax,PW_F0299_F0337)] ; mm6=ROH*FIX(0.299)+GOH*FIX(0.337) + + movq mm7, mm6 ; mm7=ROH*FIX(0.299)+GOH*FIX(0.337) + + movq mm6,mm0 + punpcklwd mm0,mm2 + punpckhwd mm6,mm2 + pmaddwd mm0,[GOTOFF(eax,PW_F0299_F0337)] ; mm0=REL*FIX(0.299)+GEL*FIX(0.337) + pmaddwd mm6,[GOTOFF(eax,PW_F0299_F0337)] ; mm6=REH*FIX(0.299)+GEH*FIX(0.337) + + movq MMWORD [wk(0)], mm0 ; wk(0)=REL*FIX(0.299)+GEL*FIX(0.337) + movq MMWORD [wk(1)], mm6 ; wk(1)=REH*FIX(0.299)+GEH*FIX(0.337) + + movq mm0, mm5 ; mm0=BO + movq mm6, mm4 ; mm6=BE + + movq mm4,mm0 + punpcklwd mm0,mm3 + punpckhwd mm4,mm3 + pmaddwd mm0,[GOTOFF(eax,PW_F0114_F0250)] ; mm0=BOL*FIX(0.114)+GOL*FIX(0.250) + pmaddwd mm4,[GOTOFF(eax,PW_F0114_F0250)] ; mm4=BOH*FIX(0.114)+GOH*FIX(0.250) + + movq mm3,[GOTOFF(eax,PD_ONEHALF)] ; mm3=[PD_ONEHALF] + + paddd mm0, mm1 + paddd mm4, mm7 + paddd mm0,mm3 + paddd mm4,mm3 + psrld mm0,SCALEBITS ; mm0=YOL + psrld mm4,SCALEBITS ; mm4=YOH + packssdw mm0,mm4 ; mm0=YO + + movq mm4,mm6 + punpcklwd mm6,mm2 + punpckhwd mm4,mm2 + pmaddwd mm6,[GOTOFF(eax,PW_F0114_F0250)] ; mm6=BEL*FIX(0.114)+GEL*FIX(0.250) + pmaddwd mm4,[GOTOFF(eax,PW_F0114_F0250)] ; mm4=BEH*FIX(0.114)+GEH*FIX(0.250) + + movq mm2,[GOTOFF(eax,PD_ONEHALF)] ; mm2=[PD_ONEHALF] + + paddd mm6, MMWORD [wk(0)] + paddd mm4, MMWORD [wk(1)] + paddd mm6,mm2 + paddd mm4,mm2 + psrld mm6,SCALEBITS ; mm6=YEL + psrld mm4,SCALEBITS ; mm4=YEH + packssdw mm6,mm4 ; mm6=YE + + psllw mm0,BYTE_BIT + por mm6,mm0 ; mm6=Y + movq MMWORD [edi], mm6 ; Save Y + + sub ecx, byte SIZEOF_MMWORD + add esi, byte RGB_PIXELSIZE*SIZEOF_MMWORD ; inptr + add edi, byte SIZEOF_MMWORD ; outptr0 + cmp ecx, byte SIZEOF_MMWORD + jae near .columnloop + test ecx,ecx + jnz near .column_ld1 + + pop ecx ; col + pop esi + pop edi + poppic eax + + add esi, byte SIZEOF_JSAMPROW ; input_buf + add edi, byte SIZEOF_JSAMPROW + dec eax ; num_rows + jg near .rowloop + + emms ; empty MMX state + +.return: + pop edi + pop esi +; pop edx ; need not be preserved +; pop ecx ; need not be preserved + pop ebx + mov esp,ebp ; esp <- aligned ebp + pop esp ; esp <- original ebp + pop ebp + ret + +; For some reason, the OS X linker does not honor the request to align the +; segment unless we do this. + align 16 diff --git a/media/libjpeg/simd/jcgryext-sse2-64.asm b/media/libjpeg/simd/jcgryext-sse2-64.asm new file mode 100644 index 000000000..541355af8 --- /dev/null +++ b/media/libjpeg/simd/jcgryext-sse2-64.asm @@ -0,0 +1,365 @@ +; +; jcgryext.asm - grayscale colorspace conversion (64-bit SSE2) +; +; Copyright (C) 2011, D. R. Commander. +; +; Based on the x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler). +; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 +; +; [TAB8] + +%include "jcolsamp.inc" + +; -------------------------------------------------------------------------- +; +; Convert some rows of samples to the output colorspace. +; +; GLOBAL(void) +; jsimd_rgb_gray_convert_sse2 (JDIMENSION img_width, +; JSAMPARRAY input_buf, JSAMPIMAGE output_buf, +; JDIMENSION output_row, int num_rows); +; + +; r10 = JDIMENSION img_width +; r11 = JSAMPARRAY input_buf +; r12 = JSAMPIMAGE output_buf +; r13 = JDIMENSION output_row +; r14 = int num_rows + +%define wk(i) rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM] +%define WK_NUM 2 + + align 16 + + global EXTN(jsimd_rgb_gray_convert_sse2) + +EXTN(jsimd_rgb_gray_convert_sse2): + push rbp + mov rax,rsp ; rax = original rbp + sub rsp, byte 4 + and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits + mov [rsp],rax + mov rbp,rsp ; rbp = aligned rbp + lea rsp, [wk(0)] + collect_args + push rbx + + mov ecx, r10d + test rcx,rcx + jz near .return + + push rcx + + mov rsi, r12 + mov ecx, r13d + mov rdi, JSAMPARRAY [rsi+0*SIZEOF_JSAMPARRAY] + lea rdi, [rdi+rcx*SIZEOF_JSAMPROW] + + pop rcx + + mov rsi, r11 + mov eax, r14d + test rax,rax + jle near .return +.rowloop: + push rdi + push rsi + push rcx ; col + + mov rsi, JSAMPROW [rsi] ; inptr + mov rdi, JSAMPROW [rdi] ; outptr0 + + cmp rcx, byte SIZEOF_XMMWORD + jae near .columnloop + +%if RGB_PIXELSIZE == 3 ; --------------- + +.column_ld1: + push rax + push rdx + lea rcx,[rcx+rcx*2] ; imul ecx,RGB_PIXELSIZE + test cl, SIZEOF_BYTE + jz short .column_ld2 + sub rcx, byte SIZEOF_BYTE + movzx rax, BYTE [rsi+rcx] +.column_ld2: + test cl, SIZEOF_WORD + jz short .column_ld4 + sub rcx, byte SIZEOF_WORD + movzx rdx, WORD [rsi+rcx] + shl rax, WORD_BIT + or rax,rdx +.column_ld4: + movd xmmA,eax + pop rdx + pop rax + test cl, SIZEOF_DWORD + jz short .column_ld8 + sub rcx, byte SIZEOF_DWORD + movd xmmF, XMM_DWORD [rsi+rcx] + pslldq xmmA, SIZEOF_DWORD + por xmmA,xmmF +.column_ld8: + test cl, SIZEOF_MMWORD + jz short .column_ld16 + sub rcx, byte SIZEOF_MMWORD + movq xmmB, XMM_MMWORD [rsi+rcx] + pslldq xmmA, SIZEOF_MMWORD + por xmmA,xmmB +.column_ld16: + test cl, SIZEOF_XMMWORD + jz short .column_ld32 + movdqa xmmF,xmmA + movdqu xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD] + mov rcx, SIZEOF_XMMWORD + jmp short .rgb_gray_cnv +.column_ld32: + test cl, 2*SIZEOF_XMMWORD + mov rcx, SIZEOF_XMMWORD + jz short .rgb_gray_cnv + movdqa xmmB,xmmA + movdqu xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD] + movdqu xmmF, XMMWORD [rsi+1*SIZEOF_XMMWORD] + jmp short .rgb_gray_cnv + +.columnloop: + movdqu xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD] + movdqu xmmF, XMMWORD [rsi+1*SIZEOF_XMMWORD] + movdqu xmmB, XMMWORD [rsi+2*SIZEOF_XMMWORD] + +.rgb_gray_cnv: + ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05) + ; xmmF=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A) + ; xmmB=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F) + + movdqa xmmG,xmmA + pslldq xmmA,8 ; xmmA=(-- -- -- -- -- -- -- -- 00 10 20 01 11 21 02 12) + psrldq xmmG,8 ; xmmG=(22 03 13 23 04 14 24 05 -- -- -- -- -- -- -- --) + + punpckhbw xmmA,xmmF ; xmmA=(00 08 10 18 20 28 01 09 11 19 21 29 02 0A 12 1A) + pslldq xmmF,8 ; xmmF=(-- -- -- -- -- -- -- -- 15 25 06 16 26 07 17 27) + + punpcklbw xmmG,xmmB ; xmmG=(22 2A 03 0B 13 1B 23 2B 04 0C 14 1C 24 2C 05 0D) + punpckhbw xmmF,xmmB ; xmmF=(15 1D 25 2D 06 0E 16 1E 26 2E 07 0F 17 1F 27 2F) + + movdqa xmmD,xmmA + pslldq xmmA,8 ; xmmA=(-- -- -- -- -- -- -- -- 00 08 10 18 20 28 01 09) + psrldq xmmD,8 ; xmmD=(11 19 21 29 02 0A 12 1A -- -- -- -- -- -- -- --) + + punpckhbw xmmA,xmmG ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 01 05 09 0D) + pslldq xmmG,8 ; xmmG=(-- -- -- -- -- -- -- -- 22 2A 03 0B 13 1B 23 2B) + + punpcklbw xmmD,xmmF ; xmmD=(11 15 19 1D 21 25 29 2D 02 06 0A 0E 12 16 1A 1E) + punpckhbw xmmG,xmmF ; xmmG=(22 26 2A 2E 03 07 0B 0F 13 17 1B 1F 23 27 2B 2F) + + movdqa xmmE,xmmA + pslldq xmmA,8 ; xmmA=(-- -- -- -- -- -- -- -- 00 04 08 0C 10 14 18 1C) + psrldq xmmE,8 ; xmmE=(20 24 28 2C 01 05 09 0D -- -- -- -- -- -- -- --) + + punpckhbw xmmA,xmmD ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E) + pslldq xmmD,8 ; xmmD=(-- -- -- -- -- -- -- -- 11 15 19 1D 21 25 29 2D) + + punpcklbw xmmE,xmmG ; xmmE=(20 22 24 26 28 2A 2C 2E 01 03 05 07 09 0B 0D 0F) + punpckhbw xmmD,xmmG ; xmmD=(11 13 15 17 19 1B 1D 1F 21 23 25 27 29 2B 2D 2F) + + pxor xmmH,xmmH + + movdqa xmmC,xmmA + punpcklbw xmmA,xmmH ; xmmA=(00 02 04 06 08 0A 0C 0E) + punpckhbw xmmC,xmmH ; xmmC=(10 12 14 16 18 1A 1C 1E) + + movdqa xmmB,xmmE + punpcklbw xmmE,xmmH ; xmmE=(20 22 24 26 28 2A 2C 2E) + punpckhbw xmmB,xmmH ; xmmB=(01 03 05 07 09 0B 0D 0F) + + movdqa xmmF,xmmD + punpcklbw xmmD,xmmH ; xmmD=(11 13 15 17 19 1B 1D 1F) + punpckhbw xmmF,xmmH ; xmmF=(21 23 25 27 29 2B 2D 2F) + +%else ; RGB_PIXELSIZE == 4 ; ----------- + +.column_ld1: + test cl, SIZEOF_XMMWORD/16 + jz short .column_ld2 + sub rcx, byte SIZEOF_XMMWORD/16 + movd xmmA, XMM_DWORD [rsi+rcx*RGB_PIXELSIZE] +.column_ld2: + test cl, SIZEOF_XMMWORD/8 + jz short .column_ld4 + sub rcx, byte SIZEOF_XMMWORD/8 + movq xmmE, XMM_MMWORD [rsi+rcx*RGB_PIXELSIZE] + pslldq xmmA, SIZEOF_MMWORD + por xmmA,xmmE +.column_ld4: + test cl, SIZEOF_XMMWORD/4 + jz short .column_ld8 + sub rcx, byte SIZEOF_XMMWORD/4 + movdqa xmmE,xmmA + movdqu xmmA, XMMWORD [rsi+rcx*RGB_PIXELSIZE] +.column_ld8: + test cl, SIZEOF_XMMWORD/2 + mov rcx, SIZEOF_XMMWORD + jz short .rgb_gray_cnv + movdqa xmmF,xmmA + movdqa xmmH,xmmE + movdqu xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD] + movdqu xmmE, XMMWORD [rsi+1*SIZEOF_XMMWORD] + jmp short .rgb_gray_cnv + +.columnloop: + movdqu xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD] + movdqu xmmE, XMMWORD [rsi+1*SIZEOF_XMMWORD] + movdqu xmmF, XMMWORD [rsi+2*SIZEOF_XMMWORD] + movdqu xmmH, XMMWORD [rsi+3*SIZEOF_XMMWORD] + +.rgb_gray_cnv: + ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33) + ; xmmE=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37) + ; xmmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B) + ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F) + + movdqa xmmD,xmmA + punpcklbw xmmA,xmmE ; xmmA=(00 04 10 14 20 24 30 34 01 05 11 15 21 25 31 35) + punpckhbw xmmD,xmmE ; xmmD=(02 06 12 16 22 26 32 36 03 07 13 17 23 27 33 37) + + movdqa xmmC,xmmF + punpcklbw xmmF,xmmH ; xmmF=(08 0C 18 1C 28 2C 38 3C 09 0D 19 1D 29 2D 39 3D) + punpckhbw xmmC,xmmH ; xmmC=(0A 0E 1A 1E 2A 2E 3A 3E 0B 0F 1B 1F 2B 2F 3B 3F) + + movdqa xmmB,xmmA + punpcklwd xmmA,xmmF ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 30 34 38 3C) + punpckhwd xmmB,xmmF ; xmmB=(01 05 09 0D 11 15 19 1D 21 25 29 2D 31 35 39 3D) + + movdqa xmmG,xmmD + punpcklwd xmmD,xmmC ; xmmD=(02 06 0A 0E 12 16 1A 1E 22 26 2A 2E 32 36 3A 3E) + punpckhwd xmmG,xmmC ; xmmG=(03 07 0B 0F 13 17 1B 1F 23 27 2B 2F 33 37 3B 3F) + + movdqa xmmE,xmmA + punpcklbw xmmA,xmmD ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E) + punpckhbw xmmE,xmmD ; xmmE=(20 22 24 26 28 2A 2C 2E 30 32 34 36 38 3A 3C 3E) + + movdqa xmmH,xmmB + punpcklbw xmmB,xmmG ; xmmB=(01 03 05 07 09 0B 0D 0F 11 13 15 17 19 1B 1D 1F) + punpckhbw xmmH,xmmG ; xmmH=(21 23 25 27 29 2B 2D 2F 31 33 35 37 39 3B 3D 3F) + + pxor xmmF,xmmF + + movdqa xmmC,xmmA + punpcklbw xmmA,xmmF ; xmmA=(00 02 04 06 08 0A 0C 0E) + punpckhbw xmmC,xmmF ; xmmC=(10 12 14 16 18 1A 1C 1E) + + movdqa xmmD,xmmB + punpcklbw xmmB,xmmF ; xmmB=(01 03 05 07 09 0B 0D 0F) + punpckhbw xmmD,xmmF ; xmmD=(11 13 15 17 19 1B 1D 1F) + + movdqa xmmG,xmmE + punpcklbw xmmE,xmmF ; xmmE=(20 22 24 26 28 2A 2C 2E) + punpckhbw xmmG,xmmF ; xmmG=(30 32 34 36 38 3A 3C 3E) + + punpcklbw xmmF,xmmH + punpckhbw xmmH,xmmH + psrlw xmmF,BYTE_BIT ; xmmF=(21 23 25 27 29 2B 2D 2F) + psrlw xmmH,BYTE_BIT ; xmmH=(31 33 35 37 39 3B 3D 3F) + +%endif ; RGB_PIXELSIZE ; --------------- + + ; xmm0=R(02468ACE)=RE, xmm2=G(02468ACE)=GE, xmm4=B(02468ACE)=BE + ; xmm1=R(13579BDF)=RO, xmm3=G(13579BDF)=GO, xmm5=B(13579BDF)=BO + + ; (Original) + ; Y = 0.29900 * R + 0.58700 * G + 0.11400 * B + ; + ; (This implementation) + ; Y = 0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G + + movdqa xmm6,xmm1 + punpcklwd xmm1,xmm3 + punpckhwd xmm6,xmm3 + pmaddwd xmm1,[rel PW_F0299_F0337] ; xmm1=ROL*FIX(0.299)+GOL*FIX(0.337) + pmaddwd xmm6,[rel PW_F0299_F0337] ; xmm6=ROH*FIX(0.299)+GOH*FIX(0.337) + + movdqa xmm7, xmm6 ; xmm7=ROH*FIX(0.299)+GOH*FIX(0.337) + + movdqa xmm6,xmm0 + punpcklwd xmm0,xmm2 + punpckhwd xmm6,xmm2 + pmaddwd xmm0,[rel PW_F0299_F0337] ; xmm0=REL*FIX(0.299)+GEL*FIX(0.337) + pmaddwd xmm6,[rel PW_F0299_F0337] ; xmm6=REH*FIX(0.299)+GEH*FIX(0.337) + + movdqa XMMWORD [wk(0)], xmm0 ; wk(0)=REL*FIX(0.299)+GEL*FIX(0.337) + movdqa XMMWORD [wk(1)], xmm6 ; wk(1)=REH*FIX(0.299)+GEH*FIX(0.337) + + movdqa xmm0, xmm5 ; xmm0=BO + movdqa xmm6, xmm4 ; xmm6=BE + + movdqa xmm4,xmm0 + punpcklwd xmm0,xmm3 + punpckhwd xmm4,xmm3 + pmaddwd xmm0,[rel PW_F0114_F0250] ; xmm0=BOL*FIX(0.114)+GOL*FIX(0.250) + pmaddwd xmm4,[rel PW_F0114_F0250] ; xmm4=BOH*FIX(0.114)+GOH*FIX(0.250) + + movdqa xmm3,[rel PD_ONEHALF] ; xmm3=[PD_ONEHALF] + + paddd xmm0, xmm1 + paddd xmm4, xmm7 + paddd xmm0,xmm3 + paddd xmm4,xmm3 + psrld xmm0,SCALEBITS ; xmm0=YOL + psrld xmm4,SCALEBITS ; xmm4=YOH + packssdw xmm0,xmm4 ; xmm0=YO + + movdqa xmm4,xmm6 + punpcklwd xmm6,xmm2 + punpckhwd xmm4,xmm2 + pmaddwd xmm6,[rel PW_F0114_F0250] ; xmm6=BEL*FIX(0.114)+GEL*FIX(0.250) + pmaddwd xmm4,[rel PW_F0114_F0250] ; xmm4=BEH*FIX(0.114)+GEH*FIX(0.250) + + movdqa xmm2,[rel PD_ONEHALF] ; xmm2=[PD_ONEHALF] + + paddd xmm6, XMMWORD [wk(0)] + paddd xmm4, XMMWORD [wk(1)] + paddd xmm6,xmm2 + paddd xmm4,xmm2 + psrld xmm6,SCALEBITS ; xmm6=YEL + psrld xmm4,SCALEBITS ; xmm4=YEH + packssdw xmm6,xmm4 ; xmm6=YE + + psllw xmm0,BYTE_BIT + por xmm6,xmm0 ; xmm6=Y + movdqa XMMWORD [rdi], xmm6 ; Save Y + + sub rcx, byte SIZEOF_XMMWORD + add rsi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; inptr + add rdi, byte SIZEOF_XMMWORD ; outptr0 + cmp rcx, byte SIZEOF_XMMWORD + jae near .columnloop + test rcx,rcx + jnz near .column_ld1 + + pop rcx ; col + pop rsi + pop rdi + + add rsi, byte SIZEOF_JSAMPROW ; input_buf + add rdi, byte SIZEOF_JSAMPROW + dec rax ; num_rows + jg near .rowloop + +.return: + pop rbx + uncollect_args + mov rsp,rbp ; rsp <- aligned rbp + pop rsp ; rsp <- original rbp + pop rbp + ret + +; For some reason, the OS X linker does not honor the request to align the +; segment unless we do this. + align 16 diff --git a/media/libjpeg/simd/jcgryext-sse2.asm b/media/libjpeg/simd/jcgryext-sse2.asm new file mode 100644 index 000000000..cd16dd192 --- /dev/null +++ b/media/libjpeg/simd/jcgryext-sse2.asm @@ -0,0 +1,384 @@ +; +; jcgryext.asm - grayscale colorspace conversion (SSE2) +; +; Copyright (C) 2011, D. R. Commander. +; +; Based on the x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler). +; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 +; +; [TAB8] + +%include "jcolsamp.inc" + +; -------------------------------------------------------------------------- +; +; Convert some rows of samples to the output colorspace. +; +; GLOBAL(void) +; jsimd_rgb_gray_convert_sse2 (JDIMENSION img_width, +; JSAMPARRAY input_buf, JSAMPIMAGE output_buf, +; JDIMENSION output_row, int num_rows); +; + +%define img_width(b) (b)+8 ; JDIMENSION img_width +%define input_buf(b) (b)+12 ; JSAMPARRAY input_buf +%define output_buf(b) (b)+16 ; JSAMPIMAGE output_buf +%define output_row(b) (b)+20 ; JDIMENSION output_row +%define num_rows(b) (b)+24 ; int num_rows + +%define original_ebp ebp+0 +%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM] +%define WK_NUM 2 +%define gotptr wk(0)-SIZEOF_POINTER ; void * gotptr + + align 16 + + global EXTN(jsimd_rgb_gray_convert_sse2) + +EXTN(jsimd_rgb_gray_convert_sse2): + push ebp + mov eax,esp ; eax = original ebp + sub esp, byte 4 + and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits + mov [esp],eax + mov ebp,esp ; ebp = aligned ebp + lea esp, [wk(0)] + pushpic eax ; make a room for GOT address + push ebx +; push ecx ; need not be preserved +; push edx ; need not be preserved + push esi + push edi + + get_GOT ebx ; get GOT address + movpic POINTER [gotptr], ebx ; save GOT address + + mov ecx, JDIMENSION [img_width(eax)] + test ecx,ecx + jz near .return + + push ecx + + mov esi, JSAMPIMAGE [output_buf(eax)] + mov ecx, JDIMENSION [output_row(eax)] + mov edi, JSAMPARRAY [esi+0*SIZEOF_JSAMPARRAY] + lea edi, [edi+ecx*SIZEOF_JSAMPROW] + + pop ecx + + mov esi, JSAMPARRAY [input_buf(eax)] + mov eax, INT [num_rows(eax)] + test eax,eax + jle near .return + alignx 16,7 +.rowloop: + pushpic eax + push edi + push esi + push ecx ; col + + mov esi, JSAMPROW [esi] ; inptr + mov edi, JSAMPROW [edi] ; outptr0 + movpic eax, POINTER [gotptr] ; load GOT address (eax) + + cmp ecx, byte SIZEOF_XMMWORD + jae near .columnloop + alignx 16,7 + +%if RGB_PIXELSIZE == 3 ; --------------- + +.column_ld1: + push eax + push edx + lea ecx,[ecx+ecx*2] ; imul ecx,RGB_PIXELSIZE + test cl, SIZEOF_BYTE + jz short .column_ld2 + sub ecx, byte SIZEOF_BYTE + movzx eax, BYTE [esi+ecx] +.column_ld2: + test cl, SIZEOF_WORD + jz short .column_ld4 + sub ecx, byte SIZEOF_WORD + movzx edx, WORD [esi+ecx] + shl eax, WORD_BIT + or eax,edx +.column_ld4: + movd xmmA,eax + pop edx + pop eax + test cl, SIZEOF_DWORD + jz short .column_ld8 + sub ecx, byte SIZEOF_DWORD + movd xmmF, XMM_DWORD [esi+ecx] + pslldq xmmA, SIZEOF_DWORD + por xmmA,xmmF +.column_ld8: + test cl, SIZEOF_MMWORD + jz short .column_ld16 + sub ecx, byte SIZEOF_MMWORD + movq xmmB, XMM_MMWORD [esi+ecx] + pslldq xmmA, SIZEOF_MMWORD + por xmmA,xmmB +.column_ld16: + test cl, SIZEOF_XMMWORD + jz short .column_ld32 + movdqa xmmF,xmmA + movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD] + mov ecx, SIZEOF_XMMWORD + jmp short .rgb_gray_cnv +.column_ld32: + test cl, 2*SIZEOF_XMMWORD + mov ecx, SIZEOF_XMMWORD + jz short .rgb_gray_cnv + movdqa xmmB,xmmA + movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD] + movdqu xmmF, XMMWORD [esi+1*SIZEOF_XMMWORD] + jmp short .rgb_gray_cnv + alignx 16,7 + +.columnloop: + movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD] + movdqu xmmF, XMMWORD [esi+1*SIZEOF_XMMWORD] + movdqu xmmB, XMMWORD [esi+2*SIZEOF_XMMWORD] + +.rgb_gray_cnv: + ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05) + ; xmmF=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A) + ; xmmB=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F) + + movdqa xmmG,xmmA + pslldq xmmA,8 ; xmmA=(-- -- -- -- -- -- -- -- 00 10 20 01 11 21 02 12) + psrldq xmmG,8 ; xmmG=(22 03 13 23 04 14 24 05 -- -- -- -- -- -- -- --) + + punpckhbw xmmA,xmmF ; xmmA=(00 08 10 18 20 28 01 09 11 19 21 29 02 0A 12 1A) + pslldq xmmF,8 ; xmmF=(-- -- -- -- -- -- -- -- 15 25 06 16 26 07 17 27) + + punpcklbw xmmG,xmmB ; xmmG=(22 2A 03 0B 13 1B 23 2B 04 0C 14 1C 24 2C 05 0D) + punpckhbw xmmF,xmmB ; xmmF=(15 1D 25 2D 06 0E 16 1E 26 2E 07 0F 17 1F 27 2F) + + movdqa xmmD,xmmA + pslldq xmmA,8 ; xmmA=(-- -- -- -- -- -- -- -- 00 08 10 18 20 28 01 09) + psrldq xmmD,8 ; xmmD=(11 19 21 29 02 0A 12 1A -- -- -- -- -- -- -- --) + + punpckhbw xmmA,xmmG ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 01 05 09 0D) + pslldq xmmG,8 ; xmmG=(-- -- -- -- -- -- -- -- 22 2A 03 0B 13 1B 23 2B) + + punpcklbw xmmD,xmmF ; xmmD=(11 15 19 1D 21 25 29 2D 02 06 0A 0E 12 16 1A 1E) + punpckhbw xmmG,xmmF ; xmmG=(22 26 2A 2E 03 07 0B 0F 13 17 1B 1F 23 27 2B 2F) + + movdqa xmmE,xmmA + pslldq xmmA,8 ; xmmA=(-- -- -- -- -- -- -- -- 00 04 08 0C 10 14 18 1C) + psrldq xmmE,8 ; xmmE=(20 24 28 2C 01 05 09 0D -- -- -- -- -- -- -- --) + + punpckhbw xmmA,xmmD ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E) + pslldq xmmD,8 ; xmmD=(-- -- -- -- -- -- -- -- 11 15 19 1D 21 25 29 2D) + + punpcklbw xmmE,xmmG ; xmmE=(20 22 24 26 28 2A 2C 2E 01 03 05 07 09 0B 0D 0F) + punpckhbw xmmD,xmmG ; xmmD=(11 13 15 17 19 1B 1D 1F 21 23 25 27 29 2B 2D 2F) + + pxor xmmH,xmmH + + movdqa xmmC,xmmA + punpcklbw xmmA,xmmH ; xmmA=(00 02 04 06 08 0A 0C 0E) + punpckhbw xmmC,xmmH ; xmmC=(10 12 14 16 18 1A 1C 1E) + + movdqa xmmB,xmmE + punpcklbw xmmE,xmmH ; xmmE=(20 22 24 26 28 2A 2C 2E) + punpckhbw xmmB,xmmH ; xmmB=(01 03 05 07 09 0B 0D 0F) + + movdqa xmmF,xmmD + punpcklbw xmmD,xmmH ; xmmD=(11 13 15 17 19 1B 1D 1F) + punpckhbw xmmF,xmmH ; xmmF=(21 23 25 27 29 2B 2D 2F) + +%else ; RGB_PIXELSIZE == 4 ; ----------- + +.column_ld1: + test cl, SIZEOF_XMMWORD/16 + jz short .column_ld2 + sub ecx, byte SIZEOF_XMMWORD/16 + movd xmmA, XMM_DWORD [esi+ecx*RGB_PIXELSIZE] +.column_ld2: + test cl, SIZEOF_XMMWORD/8 + jz short .column_ld4 + sub ecx, byte SIZEOF_XMMWORD/8 + movq xmmE, XMM_MMWORD [esi+ecx*RGB_PIXELSIZE] + pslldq xmmA, SIZEOF_MMWORD + por xmmA,xmmE +.column_ld4: + test cl, SIZEOF_XMMWORD/4 + jz short .column_ld8 + sub ecx, byte SIZEOF_XMMWORD/4 + movdqa xmmE,xmmA + movdqu xmmA, XMMWORD [esi+ecx*RGB_PIXELSIZE] +.column_ld8: + test cl, SIZEOF_XMMWORD/2 + mov ecx, SIZEOF_XMMWORD + jz short .rgb_gray_cnv + movdqa xmmF,xmmA + movdqa xmmH,xmmE + movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD] + movdqu xmmE, XMMWORD [esi+1*SIZEOF_XMMWORD] + jmp short .rgb_gray_cnv + alignx 16,7 + +.columnloop: + movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD] + movdqu xmmE, XMMWORD [esi+1*SIZEOF_XMMWORD] + movdqu xmmF, XMMWORD [esi+2*SIZEOF_XMMWORD] + movdqu xmmH, XMMWORD [esi+3*SIZEOF_XMMWORD] + +.rgb_gray_cnv: + ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33) + ; xmmE=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37) + ; xmmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B) + ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F) + + movdqa xmmD,xmmA + punpcklbw xmmA,xmmE ; xmmA=(00 04 10 14 20 24 30 34 01 05 11 15 21 25 31 35) + punpckhbw xmmD,xmmE ; xmmD=(02 06 12 16 22 26 32 36 03 07 13 17 23 27 33 37) + + movdqa xmmC,xmmF + punpcklbw xmmF,xmmH ; xmmF=(08 0C 18 1C 28 2C 38 3C 09 0D 19 1D 29 2D 39 3D) + punpckhbw xmmC,xmmH ; xmmC=(0A 0E 1A 1E 2A 2E 3A 3E 0B 0F 1B 1F 2B 2F 3B 3F) + + movdqa xmmB,xmmA + punpcklwd xmmA,xmmF ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 30 34 38 3C) + punpckhwd xmmB,xmmF ; xmmB=(01 05 09 0D 11 15 19 1D 21 25 29 2D 31 35 39 3D) + + movdqa xmmG,xmmD + punpcklwd xmmD,xmmC ; xmmD=(02 06 0A 0E 12 16 1A 1E 22 26 2A 2E 32 36 3A 3E) + punpckhwd xmmG,xmmC ; xmmG=(03 07 0B 0F 13 17 1B 1F 23 27 2B 2F 33 37 3B 3F) + + movdqa xmmE,xmmA + punpcklbw xmmA,xmmD ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E) + punpckhbw xmmE,xmmD ; xmmE=(20 22 24 26 28 2A 2C 2E 30 32 34 36 38 3A 3C 3E) + + movdqa xmmH,xmmB + punpcklbw xmmB,xmmG ; xmmB=(01 03 05 07 09 0B 0D 0F 11 13 15 17 19 1B 1D 1F) + punpckhbw xmmH,xmmG ; xmmH=(21 23 25 27 29 2B 2D 2F 31 33 35 37 39 3B 3D 3F) + + pxor xmmF,xmmF + + movdqa xmmC,xmmA + punpcklbw xmmA,xmmF ; xmmA=(00 02 04 06 08 0A 0C 0E) + punpckhbw xmmC,xmmF ; xmmC=(10 12 14 16 18 1A 1C 1E) + + movdqa xmmD,xmmB + punpcklbw xmmB,xmmF ; xmmB=(01 03 05 07 09 0B 0D 0F) + punpckhbw xmmD,xmmF ; xmmD=(11 13 15 17 19 1B 1D 1F) + + movdqa xmmG,xmmE + punpcklbw xmmE,xmmF ; xmmE=(20 22 24 26 28 2A 2C 2E) + punpckhbw xmmG,xmmF ; xmmG=(30 32 34 36 38 3A 3C 3E) + + punpcklbw xmmF,xmmH + punpckhbw xmmH,xmmH + psrlw xmmF,BYTE_BIT ; xmmF=(21 23 25 27 29 2B 2D 2F) + psrlw xmmH,BYTE_BIT ; xmmH=(31 33 35 37 39 3B 3D 3F) + +%endif ; RGB_PIXELSIZE ; --------------- + + ; xmm0=R(02468ACE)=RE, xmm2=G(02468ACE)=GE, xmm4=B(02468ACE)=BE + ; xmm1=R(13579BDF)=RO, xmm3=G(13579BDF)=GO, xmm5=B(13579BDF)=BO + + ; (Original) + ; Y = 0.29900 * R + 0.58700 * G + 0.11400 * B + ; + ; (This implementation) + ; Y = 0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G + + movdqa xmm6,xmm1 + punpcklwd xmm1,xmm3 + punpckhwd xmm6,xmm3 + pmaddwd xmm1,[GOTOFF(eax,PW_F0299_F0337)] ; xmm1=ROL*FIX(0.299)+GOL*FIX(0.337) + pmaddwd xmm6,[GOTOFF(eax,PW_F0299_F0337)] ; xmm6=ROH*FIX(0.299)+GOH*FIX(0.337) + + movdqa xmm7, xmm6 ; xmm7=ROH*FIX(0.299)+GOH*FIX(0.337) + + movdqa xmm6,xmm0 + punpcklwd xmm0,xmm2 + punpckhwd xmm6,xmm2 + pmaddwd xmm0,[GOTOFF(eax,PW_F0299_F0337)] ; xmm0=REL*FIX(0.299)+GEL*FIX(0.337) + pmaddwd xmm6,[GOTOFF(eax,PW_F0299_F0337)] ; xmm6=REH*FIX(0.299)+GEH*FIX(0.337) + + movdqa XMMWORD [wk(0)], xmm0 ; wk(0)=REL*FIX(0.299)+GEL*FIX(0.337) + movdqa XMMWORD [wk(1)], xmm6 ; wk(1)=REH*FIX(0.299)+GEH*FIX(0.337) + + movdqa xmm0, xmm5 ; xmm0=BO + movdqa xmm6, xmm4 ; xmm6=BE + + movdqa xmm4,xmm0 + punpcklwd xmm0,xmm3 + punpckhwd xmm4,xmm3 + pmaddwd xmm0,[GOTOFF(eax,PW_F0114_F0250)] ; xmm0=BOL*FIX(0.114)+GOL*FIX(0.250) + pmaddwd xmm4,[GOTOFF(eax,PW_F0114_F0250)] ; xmm4=BOH*FIX(0.114)+GOH*FIX(0.250) + + movdqa xmm3,[GOTOFF(eax,PD_ONEHALF)] ; xmm3=[PD_ONEHALF] + + paddd xmm0, xmm1 + paddd xmm4, xmm7 + paddd xmm0,xmm3 + paddd xmm4,xmm3 + psrld xmm0,SCALEBITS ; xmm0=YOL + psrld xmm4,SCALEBITS ; xmm4=YOH + packssdw xmm0,xmm4 ; xmm0=YO + + movdqa xmm4,xmm6 + punpcklwd xmm6,xmm2 + punpckhwd xmm4,xmm2 + pmaddwd xmm6,[GOTOFF(eax,PW_F0114_F0250)] ; xmm6=BEL*FIX(0.114)+GEL*FIX(0.250) + pmaddwd xmm4,[GOTOFF(eax,PW_F0114_F0250)] ; xmm4=BEH*FIX(0.114)+GEH*FIX(0.250) + + movdqa xmm2,[GOTOFF(eax,PD_ONEHALF)] ; xmm2=[PD_ONEHALF] + + paddd xmm6, XMMWORD [wk(0)] + paddd xmm4, XMMWORD [wk(1)] + paddd xmm6,xmm2 + paddd xmm4,xmm2 + psrld xmm6,SCALEBITS ; xmm6=YEL + psrld xmm4,SCALEBITS ; xmm4=YEH + packssdw xmm6,xmm4 ; xmm6=YE + + psllw xmm0,BYTE_BIT + por xmm6,xmm0 ; xmm6=Y + movdqa XMMWORD [edi], xmm6 ; Save Y + + sub ecx, byte SIZEOF_XMMWORD + add esi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; inptr + add edi, byte SIZEOF_XMMWORD ; outptr0 + cmp ecx, byte SIZEOF_XMMWORD + jae near .columnloop + test ecx,ecx + jnz near .column_ld1 + + pop ecx ; col + pop esi + pop edi + poppic eax + + add esi, byte SIZEOF_JSAMPROW ; input_buf + add edi, byte SIZEOF_JSAMPROW + dec eax ; num_rows + jg near .rowloop + +.return: + pop edi + pop esi +; pop edx ; need not be preserved +; pop ecx ; need not be preserved + pop ebx + mov esp,ebp ; esp <- aligned ebp + pop esp ; esp <- original ebp + pop ebp + ret + +; For some reason, the OS X linker does not honor the request to align the +; segment unless we do this. + align 16 diff --git a/media/libjpeg/simd/jchuff-sse2-64.asm b/media/libjpeg/simd/jchuff-sse2-64.asm new file mode 100644 index 000000000..b1144d1cd --- /dev/null +++ b/media/libjpeg/simd/jchuff-sse2-64.asm @@ -0,0 +1,360 @@ +; +; jchuff-sse2-64.asm - Huffman entropy encoding (64-bit SSE2) +; +; Copyright (C) 2009-2011, 2014-2016, D. R. Commander. +; Copyright (C) 2015, Matthieu Darbois. +; +; Based on the x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler). +; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 +; +; This file contains an SSE2 implementation for Huffman coding of one block. +; The following code is based directly on jchuff.c; see jchuff.c for more +; details. +; +; [TAB8] + +%include "jsimdext.inc" + +; -------------------------------------------------------------------------- + SECTION SEG_CONST + + alignz 16 + global EXTN(jconst_huff_encode_one_block) + +EXTN(jconst_huff_encode_one_block): + +%include "jpeg_nbits_table.inc" + + alignz 16 + +; -------------------------------------------------------------------------- + SECTION SEG_TEXT + BITS 64 + +; These macros perform the same task as the emit_bits() function in the +; original libjpeg code. In addition to reducing overhead by explicitly +; inlining the code, additional performance is achieved by taking into +; account the size of the bit buffer and waiting until it is almost full +; before emptying it. This mostly benefits 64-bit platforms, since 6 +; bytes can be stored in a 64-bit bit buffer before it has to be emptied. + +%macro EMIT_BYTE 0 + sub put_bits, 8 ; put_bits -= 8; + mov rdx, put_buffer + mov ecx, put_bits + shr rdx, cl ; c = (JOCTET)GETJOCTET(put_buffer >> put_bits); + mov byte [buffer], dl ; *buffer++ = c; + add buffer, 1 + cmp dl, 0xFF ; need to stuff a zero byte? + jne %%.EMIT_BYTE_END + mov byte [buffer], 0 ; *buffer++ = 0; + add buffer, 1 +%%.EMIT_BYTE_END: +%endmacro + +%macro PUT_BITS 1 + add put_bits, ecx ; put_bits += size; + shl put_buffer, cl ; put_buffer = (put_buffer << size); + or put_buffer, %1 +%endmacro + +%macro CHECKBUF31 0 + cmp put_bits, 32 ; if (put_bits > 31) { + jl %%.CHECKBUF31_END + EMIT_BYTE + EMIT_BYTE + EMIT_BYTE + EMIT_BYTE +%%.CHECKBUF31_END: +%endmacro + +%macro CHECKBUF47 0 + cmp put_bits, 48 ; if (put_bits > 47) { + jl %%.CHECKBUF47_END + EMIT_BYTE + EMIT_BYTE + EMIT_BYTE + EMIT_BYTE + EMIT_BYTE + EMIT_BYTE +%%.CHECKBUF47_END: +%endmacro + +%macro EMIT_BITS 2 + CHECKBUF47 + mov ecx, %2 + PUT_BITS %1 +%endmacro + +%macro kloop_prepare 37 ;(ko, jno0, ..., jno31, xmm0, xmm1, xmm2, xmm3) + pxor xmm8, xmm8 ; __m128i neg = _mm_setzero_si128(); + pxor xmm9, xmm9 ; __m128i neg = _mm_setzero_si128(); + pxor xmm10, xmm10 ; __m128i neg = _mm_setzero_si128(); + pxor xmm11, xmm11 ; __m128i neg = _mm_setzero_si128(); + pinsrw %34, word [r12 + %2 * SIZEOF_WORD], 0 ; xmm_shadow[0] = block[jno0]; + pinsrw %35, word [r12 + %10 * SIZEOF_WORD], 0 ; xmm_shadow[8] = block[jno8]; + pinsrw %36, word [r12 + %18 * SIZEOF_WORD], 0 ; xmm_shadow[16] = block[jno16]; + pinsrw %37, word [r12 + %26 * SIZEOF_WORD], 0 ; xmm_shadow[24] = block[jno24]; + pinsrw %34, word [r12 + %3 * SIZEOF_WORD], 1 ; xmm_shadow[1] = block[jno1]; + pinsrw %35, word [r12 + %11 * SIZEOF_WORD], 1 ; xmm_shadow[9] = block[jno9]; + pinsrw %36, word [r12 + %19 * SIZEOF_WORD], 1 ; xmm_shadow[17] = block[jno17]; + pinsrw %37, word [r12 + %27 * SIZEOF_WORD], 1 ; xmm_shadow[25] = block[jno25]; + pinsrw %34, word [r12 + %4 * SIZEOF_WORD], 2 ; xmm_shadow[2] = block[jno2]; + pinsrw %35, word [r12 + %12 * SIZEOF_WORD], 2 ; xmm_shadow[10] = block[jno10]; + pinsrw %36, word [r12 + %20 * SIZEOF_WORD], 2 ; xmm_shadow[18] = block[jno18]; + pinsrw %37, word [r12 + %28 * SIZEOF_WORD], 2 ; xmm_shadow[26] = block[jno26]; + pinsrw %34, word [r12 + %5 * SIZEOF_WORD], 3 ; xmm_shadow[3] = block[jno3]; + pinsrw %35, word [r12 + %13 * SIZEOF_WORD], 3 ; xmm_shadow[11] = block[jno11]; + pinsrw %36, word [r12 + %21 * SIZEOF_WORD], 3 ; xmm_shadow[19] = block[jno19]; + pinsrw %37, word [r12 + %29 * SIZEOF_WORD], 3 ; xmm_shadow[27] = block[jno27]; + pinsrw %34, word [r12 + %6 * SIZEOF_WORD], 4 ; xmm_shadow[4] = block[jno4]; + pinsrw %35, word [r12 + %14 * SIZEOF_WORD], 4 ; xmm_shadow[12] = block[jno12]; + pinsrw %36, word [r12 + %22 * SIZEOF_WORD], 4 ; xmm_shadow[20] = block[jno20]; + pinsrw %37, word [r12 + %30 * SIZEOF_WORD], 4 ; xmm_shadow[28] = block[jno28]; + pinsrw %34, word [r12 + %7 * SIZEOF_WORD], 5 ; xmm_shadow[5] = block[jno5]; + pinsrw %35, word [r12 + %15 * SIZEOF_WORD], 5 ; xmm_shadow[13] = block[jno13]; + pinsrw %36, word [r12 + %23 * SIZEOF_WORD], 5 ; xmm_shadow[21] = block[jno21]; + pinsrw %37, word [r12 + %31 * SIZEOF_WORD], 5 ; xmm_shadow[29] = block[jno29]; + pinsrw %34, word [r12 + %8 * SIZEOF_WORD], 6 ; xmm_shadow[6] = block[jno6]; + pinsrw %35, word [r12 + %16 * SIZEOF_WORD], 6 ; xmm_shadow[14] = block[jno14]; + pinsrw %36, word [r12 + %24 * SIZEOF_WORD], 6 ; xmm_shadow[22] = block[jno22]; + pinsrw %37, word [r12 + %32 * SIZEOF_WORD], 6 ; xmm_shadow[30] = block[jno30]; + pinsrw %34, word [r12 + %9 * SIZEOF_WORD], 7 ; xmm_shadow[7] = block[jno7]; + pinsrw %35, word [r12 + %17 * SIZEOF_WORD], 7 ; xmm_shadow[15] = block[jno15]; + pinsrw %36, word [r12 + %25 * SIZEOF_WORD], 7 ; xmm_shadow[23] = block[jno23]; +%if %1 != 32 + pinsrw %37, word [r12 + %33 * SIZEOF_WORD], 7 ; xmm_shadow[31] = block[jno31]; +%else + pinsrw %37, ebx, 7 ; xmm_shadow[31] = block[jno31]; +%endif + pcmpgtw xmm8, %34 ; neg = _mm_cmpgt_epi16(neg, x1); + pcmpgtw xmm9, %35 ; neg = _mm_cmpgt_epi16(neg, x1); + pcmpgtw xmm10, %36 ; neg = _mm_cmpgt_epi16(neg, x1); + pcmpgtw xmm11, %37 ; neg = _mm_cmpgt_epi16(neg, x1); + paddw %34, xmm8 ; x1 = _mm_add_epi16(x1, neg); + paddw %35, xmm9 ; x1 = _mm_add_epi16(x1, neg); + paddw %36, xmm10 ; x1 = _mm_add_epi16(x1, neg); + paddw %37, xmm11 ; x1 = _mm_add_epi16(x1, neg); + pxor %34, xmm8 ; x1 = _mm_xor_si128(x1, neg); + pxor %35, xmm9 ; x1 = _mm_xor_si128(x1, neg); + pxor %36, xmm10 ; x1 = _mm_xor_si128(x1, neg); + pxor %37, xmm11 ; x1 = _mm_xor_si128(x1, neg); + pxor xmm8, %34 ; neg = _mm_xor_si128(neg, x1); + pxor xmm9, %35 ; neg = _mm_xor_si128(neg, x1); + pxor xmm10, %36 ; neg = _mm_xor_si128(neg, x1); + pxor xmm11, %37 ; neg = _mm_xor_si128(neg, x1); + movdqa XMMWORD [t1 + %1 * SIZEOF_WORD], %34 ; _mm_storeu_si128((__m128i *)(t1 + ko), x1); + movdqa XMMWORD [t1 + (%1 + 8) * SIZEOF_WORD], %35 ; _mm_storeu_si128((__m128i *)(t1 + ko + 8), x1); + movdqa XMMWORD [t1 + (%1 + 16) * SIZEOF_WORD], %36 ; _mm_storeu_si128((__m128i *)(t1 + ko + 16), x1); + movdqa XMMWORD [t1 + (%1 + 24) * SIZEOF_WORD], %37 ; _mm_storeu_si128((__m128i *)(t1 + ko + 24), x1); + movdqa XMMWORD [t2 + %1 * SIZEOF_WORD], xmm8 ; _mm_storeu_si128((__m128i *)(t2 + ko), neg); + movdqa XMMWORD [t2 + (%1 + 8) * SIZEOF_WORD], xmm9 ; _mm_storeu_si128((__m128i *)(t2 + ko + 8), neg); + movdqa XMMWORD [t2 + (%1 + 16) * SIZEOF_WORD], xmm10 ; _mm_storeu_si128((__m128i *)(t2 + ko + 16), neg); + movdqa XMMWORD [t2 + (%1 + 24) * SIZEOF_WORD], xmm11 ; _mm_storeu_si128((__m128i *)(t2 + ko + 24), neg); +%endmacro + +; +; Encode a single block's worth of coefficients. +; +; GLOBAL(JOCTET*) +; jsimd_huff_encode_one_block_sse2 (working_state *state, JOCTET *buffer, +; JCOEFPTR block, int last_dc_val, +; c_derived_tbl *dctbl, c_derived_tbl *actbl) +; + +; r10 = working_state *state +; r11 = JOCTET *buffer +; r12 = JCOEFPTR block +; r13 = int last_dc_val +; r14 = c_derived_tbl *dctbl +; r15 = c_derived_tbl *actbl + +%define t1 rbp-(DCTSIZE2*SIZEOF_WORD) +%define t2 t1-(DCTSIZE2*SIZEOF_WORD) +%define put_buffer r8 +%define put_bits r9d +%define buffer rax + + align 16 + global EXTN(jsimd_huff_encode_one_block_sse2) + +EXTN(jsimd_huff_encode_one_block_sse2): + push rbp + mov rax,rsp ; rax = original rbp + sub rsp, byte 4 + and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits + mov [rsp],rax + mov rbp,rsp ; rbp = aligned rbp + lea rsp, [t2] + collect_args +%ifdef WIN64 + movaps XMMWORD [rsp-1*SIZEOF_XMMWORD], xmm8 + movaps XMMWORD [rsp-2*SIZEOF_XMMWORD], xmm9 + movaps XMMWORD [rsp-3*SIZEOF_XMMWORD], xmm10 + movaps XMMWORD [rsp-4*SIZEOF_XMMWORD], xmm11 + sub rsp, 4*SIZEOF_XMMWORD +%endif + push rbx + + mov buffer, r11 ; r11 is now sratch + + mov put_buffer, MMWORD [r10+16] ; put_buffer = state->cur.put_buffer; + mov put_bits, DWORD [r10+24] ; put_bits = state->cur.put_bits; + push r10 ; r10 is now scratch + + ; Encode the DC coefficient difference per section F.1.2.1 + movsx edi, word [r12] ; temp = temp2 = block[0] - last_dc_val; + sub edi, r13d ; r13 is not used anymore + mov ebx, edi + + ; This is a well-known technique for obtaining the absolute value + ; without a branch. It is derived from an assembly language technique + ; presented in "How to Optimize for the Pentium Processors", + ; Copyright (c) 1996, 1997 by Agner Fog. + mov esi, edi + sar esi, 31 ; temp3 = temp >> (CHAR_BIT * sizeof(int) - 1); + xor edi, esi ; temp ^= temp3; + sub edi, esi ; temp -= temp3; + + ; For a negative input, want temp2 = bitwise complement of abs(input) + ; This code assumes we are on a two's complement machine + add ebx, esi ; temp2 += temp3; + + ; Find the number of bits needed for the magnitude of the coefficient + lea r11, [rel jpeg_nbits_table] + movzx rdi, byte [r11 + rdi] ; nbits = JPEG_NBITS(temp); + ; Emit the Huffman-coded symbol for the number of bits + mov r11d, INT [r14 + rdi * 4] ; code = dctbl->ehufco[nbits]; + movzx esi, byte [r14 + rdi + 1024] ; size = dctbl->ehufsi[nbits]; + EMIT_BITS r11, esi ; EMIT_BITS(code, size) + + ; Mask off any extra bits in code + mov esi, 1 + mov ecx, edi + shl esi, cl + dec esi + and ebx, esi ; temp2 &= (((JLONG) 1)<<nbits) - 1; + + ; Emit that number of bits of the value, if positive, + ; or the complement of its magnitude, if negative. + EMIT_BITS rbx, edi ; EMIT_BITS(temp2, nbits) + + ; Prepare data + xor ebx, ebx + kloop_prepare 0, 1, 8, 16, 9, 2, 3, 10, 17, 24, 32, 25, \ + 18, 11, 4, 5, 12, 19, 26, 33, 40, 48, 41, 34, \ + 27, 20, 13, 6, 7, 14, 21, 28, 35, \ + xmm0, xmm1, xmm2, xmm3 + kloop_prepare 32, 42, 49, 56, 57, 50, 43, 36, 29, 22, 15, 23, \ + 30, 37, 44, 51, 58, 59, 52, 45, 38, 31, 39, 46, \ + 53, 60, 61, 54, 47, 55, 62, 63, 63, \ + xmm4, xmm5, xmm6, xmm7 + + pxor xmm8, xmm8 + pcmpeqw xmm0, xmm8 ; tmp0 = _mm_cmpeq_epi16(tmp0, zero); + pcmpeqw xmm1, xmm8 ; tmp1 = _mm_cmpeq_epi16(tmp1, zero); + pcmpeqw xmm2, xmm8 ; tmp2 = _mm_cmpeq_epi16(tmp2, zero); + pcmpeqw xmm3, xmm8 ; tmp3 = _mm_cmpeq_epi16(tmp3, zero); + pcmpeqw xmm4, xmm8 ; tmp4 = _mm_cmpeq_epi16(tmp4, zero); + pcmpeqw xmm5, xmm8 ; tmp5 = _mm_cmpeq_epi16(tmp5, zero); + pcmpeqw xmm6, xmm8 ; tmp6 = _mm_cmpeq_epi16(tmp6, zero); + pcmpeqw xmm7, xmm8 ; tmp7 = _mm_cmpeq_epi16(tmp7, zero); + packsswb xmm0, xmm1 ; tmp0 = _mm_packs_epi16(tmp0, tmp1); + packsswb xmm2, xmm3 ; tmp2 = _mm_packs_epi16(tmp2, tmp3); + packsswb xmm4, xmm5 ; tmp4 = _mm_packs_epi16(tmp4, tmp5); + packsswb xmm6, xmm7 ; tmp6 = _mm_packs_epi16(tmp6, tmp7); + pmovmskb r11d, xmm0 ; index = ((uint64_t)_mm_movemask_epi8(tmp0)) << 0; + pmovmskb r12d, xmm2 ; index = ((uint64_t)_mm_movemask_epi8(tmp2)) << 16; + pmovmskb r13d, xmm4 ; index = ((uint64_t)_mm_movemask_epi8(tmp4)) << 32; + pmovmskb r14d, xmm6 ; index = ((uint64_t)_mm_movemask_epi8(tmp6)) << 48; + shl r12, 16 + shl r14, 16 + or r11, r12 + or r13, r14 + shl r13, 32 + or r11, r13 + not r11 ; index = ~index; + + ;mov MMWORD [ t1 + DCTSIZE2 * SIZEOF_WORD ], r11 + ;jmp .EFN + + mov r13d, INT [r15 + 240 * 4] ; code_0xf0 = actbl->ehufco[0xf0]; + movzx r14d, byte [r15 + 1024 + 240] ; size_0xf0 = actbl->ehufsi[0xf0]; + lea rsi, [t1] +.BLOOP: + bsf r12, r11 ; r = __builtin_ctzl(index); + jz .ELOOP + mov rcx, r12 + lea rsi, [rsi+r12*2] ; k += r; + shr r11, cl ; index >>= r; + movzx rdi, word [rsi] ; temp = t1[k]; + lea rbx, [rel jpeg_nbits_table] + movzx rdi, byte [rbx + rdi] ; nbits = JPEG_NBITS(temp); +.BRLOOP: + cmp r12, 16 ; while (r > 15) { + jl .ERLOOP + EMIT_BITS r13, r14d ; EMIT_BITS(code_0xf0, size_0xf0) + sub r12, 16 ; r -= 16; + jmp .BRLOOP +.ERLOOP: + ; Emit Huffman symbol for run length / number of bits + CHECKBUF31 ; uses rcx, rdx + + shl r12, 4 ; temp3 = (r << 4) + nbits; + add r12, rdi + mov ebx, INT [r15 + r12 * 4] ; code = actbl->ehufco[temp3]; + movzx ecx, byte [r15 + r12 + 1024] ; size = actbl->ehufsi[temp3]; + PUT_BITS rbx + + ;EMIT_CODE(code, size) + + movsx ebx, word [rsi-DCTSIZE2*2] ; temp2 = t2[k]; + ; Mask off any extra bits in code + mov rcx, rdi + mov rdx, 1 + shl rdx, cl + dec rdx + and rbx, rdx ; temp2 &= (((JLONG) 1)<<nbits) - 1; + PUT_BITS rbx ; PUT_BITS(temp2, nbits) + + shr r11, 1 ; index >>= 1; + add rsi, 2 ; ++k; + jmp .BLOOP +.ELOOP: + ; If the last coef(s) were zero, emit an end-of-block code + lea rdi, [t1 + (DCTSIZE2-1) * 2] ; r = DCTSIZE2-1-k; + cmp rdi, rsi ; if (r > 0) { + je .EFN + mov ebx, INT [r15] ; code = actbl->ehufco[0]; + movzx r12d, byte [r15 + 1024] ; size = actbl->ehufsi[0]; + EMIT_BITS rbx, r12d +.EFN: + pop r10 + ; Save put_buffer & put_bits + mov MMWORD [r10+16], put_buffer ; state->cur.put_buffer = put_buffer; + mov DWORD [r10+24], put_bits ; state->cur.put_bits = put_bits; + + pop rbx +%ifdef WIN64 + movaps xmm11, XMMWORD [rsp+0*SIZEOF_XMMWORD] + movaps xmm10, XMMWORD [rsp+1*SIZEOF_XMMWORD] + movaps xmm9, XMMWORD [rsp+2*SIZEOF_XMMWORD] + movaps xmm8, XMMWORD [rsp+3*SIZEOF_XMMWORD] + add rsp, 4*SIZEOF_XMMWORD +%endif + uncollect_args + mov rsp,rbp ; rsp <- aligned rbp + pop rsp ; rsp <- original rbp + pop rbp + ret + +; For some reason, the OS X linker does not honor the request to align the +; segment unless we do this. + align 16 diff --git a/media/libjpeg/simd/jchuff-sse2.asm b/media/libjpeg/simd/jchuff-sse2.asm new file mode 100644 index 000000000..36d1f2db6 --- /dev/null +++ b/media/libjpeg/simd/jchuff-sse2.asm @@ -0,0 +1,426 @@ +; +; jchuff-sse2.asm - Huffman entropy encoding (SSE2) +; +; Copyright (C) 2009-2011, 2014-2016, D. R. Commander. +; Copyright (C) 2015, Matthieu Darbois. +; +; Based on the x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler). +; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 +; +; This file contains an SSE2 implementation for Huffman coding of one block. +; The following code is based directly on jchuff.c; see jchuff.c for more +; details. +; +; [TAB8] + +%include "jsimdext.inc" + +; -------------------------------------------------------------------------- + SECTION SEG_CONST + + alignz 16 + global EXTN(jconst_huff_encode_one_block) + +EXTN(jconst_huff_encode_one_block): + +%include "jpeg_nbits_table.inc" + + alignz 16 + +; -------------------------------------------------------------------------- + SECTION SEG_TEXT + BITS 32 + +; These macros perform the same task as the emit_bits() function in the +; original libjpeg code. In addition to reducing overhead by explicitly +; inlining the code, additional performance is achieved by taking into +; account the size of the bit buffer and waiting until it is almost full +; before emptying it. This mostly benefits 64-bit platforms, since 6 +; bytes can be stored in a 64-bit bit buffer before it has to be emptied. + +%macro EMIT_BYTE 0 + sub put_bits, 8 ; put_bits -= 8; + mov edx, put_buffer + mov ecx, put_bits + shr edx, cl ; c = (JOCTET)GETJOCTET(put_buffer >> put_bits); + mov byte [eax], dl ; *buffer++ = c; + add eax, 1 + cmp dl, 0xFF ; need to stuff a zero byte? + jne %%.EMIT_BYTE_END + mov byte [eax], 0 ; *buffer++ = 0; + add eax, 1 +%%.EMIT_BYTE_END: +%endmacro + +%macro PUT_BITS 1 + add put_bits, ecx ; put_bits += size; + shl put_buffer, cl ; put_buffer = (put_buffer << size); + or put_buffer, %1 +%endmacro + +%macro CHECKBUF15 0 + cmp put_bits, 16 ; if (put_bits > 31) { + jl %%.CHECKBUF15_END + mov eax, POINTER [esp+buffer] + EMIT_BYTE + EMIT_BYTE + mov POINTER [esp+buffer], eax +%%.CHECKBUF15_END: +%endmacro + +%macro EMIT_BITS 1 + PUT_BITS %1 + CHECKBUF15 +%endmacro + +%macro kloop_prepare 37 ;(ko, jno0, ..., jno31, xmm0, xmm1, xmm2, xmm3) + pxor xmm4, xmm4 ; __m128i neg = _mm_setzero_si128(); + pxor xmm5, xmm5 ; __m128i neg = _mm_setzero_si128(); + pxor xmm6, xmm6 ; __m128i neg = _mm_setzero_si128(); + pxor xmm7, xmm7 ; __m128i neg = _mm_setzero_si128(); + pinsrw %34, word [esi + %2 * SIZEOF_WORD], 0 ; xmm_shadow[0] = block[jno0]; + pinsrw %35, word [esi + %10 * SIZEOF_WORD], 0 ; xmm_shadow[8] = block[jno8]; + pinsrw %36, word [esi + %18 * SIZEOF_WORD], 0 ; xmm_shadow[16] = block[jno16]; + pinsrw %37, word [esi + %26 * SIZEOF_WORD], 0 ; xmm_shadow[24] = block[jno24]; + pinsrw %34, word [esi + %3 * SIZEOF_WORD], 1 ; xmm_shadow[1] = block[jno1]; + pinsrw %35, word [esi + %11 * SIZEOF_WORD], 1 ; xmm_shadow[9] = block[jno9]; + pinsrw %36, word [esi + %19 * SIZEOF_WORD], 1 ; xmm_shadow[17] = block[jno17]; + pinsrw %37, word [esi + %27 * SIZEOF_WORD], 1 ; xmm_shadow[25] = block[jno25]; + pinsrw %34, word [esi + %4 * SIZEOF_WORD], 2 ; xmm_shadow[2] = block[jno2]; + pinsrw %35, word [esi + %12 * SIZEOF_WORD], 2 ; xmm_shadow[10] = block[jno10]; + pinsrw %36, word [esi + %20 * SIZEOF_WORD], 2 ; xmm_shadow[18] = block[jno18]; + pinsrw %37, word [esi + %28 * SIZEOF_WORD], 2 ; xmm_shadow[26] = block[jno26]; + pinsrw %34, word [esi + %5 * SIZEOF_WORD], 3 ; xmm_shadow[3] = block[jno3]; + pinsrw %35, word [esi + %13 * SIZEOF_WORD], 3 ; xmm_shadow[11] = block[jno11]; + pinsrw %36, word [esi + %21 * SIZEOF_WORD], 3 ; xmm_shadow[19] = block[jno19]; + pinsrw %37, word [esi + %29 * SIZEOF_WORD], 3 ; xmm_shadow[27] = block[jno27]; + pinsrw %34, word [esi + %6 * SIZEOF_WORD], 4 ; xmm_shadow[4] = block[jno4]; + pinsrw %35, word [esi + %14 * SIZEOF_WORD], 4 ; xmm_shadow[12] = block[jno12]; + pinsrw %36, word [esi + %22 * SIZEOF_WORD], 4 ; xmm_shadow[20] = block[jno20]; + pinsrw %37, word [esi + %30 * SIZEOF_WORD], 4 ; xmm_shadow[28] = block[jno28]; + pinsrw %34, word [esi + %7 * SIZEOF_WORD], 5 ; xmm_shadow[5] = block[jno5]; + pinsrw %35, word [esi + %15 * SIZEOF_WORD], 5 ; xmm_shadow[13] = block[jno13]; + pinsrw %36, word [esi + %23 * SIZEOF_WORD], 5 ; xmm_shadow[21] = block[jno21]; + pinsrw %37, word [esi + %31 * SIZEOF_WORD], 5 ; xmm_shadow[29] = block[jno29]; + pinsrw %34, word [esi + %8 * SIZEOF_WORD], 6 ; xmm_shadow[6] = block[jno6]; + pinsrw %35, word [esi + %16 * SIZEOF_WORD], 6 ; xmm_shadow[14] = block[jno14]; + pinsrw %36, word [esi + %24 * SIZEOF_WORD], 6 ; xmm_shadow[22] = block[jno22]; + pinsrw %37, word [esi + %32 * SIZEOF_WORD], 6 ; xmm_shadow[30] = block[jno30]; + pinsrw %34, word [esi + %9 * SIZEOF_WORD], 7 ; xmm_shadow[7] = block[jno7]; + pinsrw %35, word [esi + %17 * SIZEOF_WORD], 7 ; xmm_shadow[15] = block[jno15]; + pinsrw %36, word [esi + %25 * SIZEOF_WORD], 7 ; xmm_shadow[23] = block[jno23]; +%if %1 != 32 + pinsrw %37, word [esi + %33 * SIZEOF_WORD], 7 ; xmm_shadow[31] = block[jno31]; +%else + pinsrw %37, ecx, 7 ; xmm_shadow[31] = block[jno31]; +%endif + pcmpgtw xmm4, %34 ; neg = _mm_cmpgt_epi16(neg, x1); + pcmpgtw xmm5, %35 ; neg = _mm_cmpgt_epi16(neg, x1); + pcmpgtw xmm6, %36 ; neg = _mm_cmpgt_epi16(neg, x1); + pcmpgtw xmm7, %37 ; neg = _mm_cmpgt_epi16(neg, x1); + paddw %34, xmm4 ; x1 = _mm_add_epi16(x1, neg); + paddw %35, xmm5 ; x1 = _mm_add_epi16(x1, neg); + paddw %36, xmm6 ; x1 = _mm_add_epi16(x1, neg); + paddw %37, xmm7 ; x1 = _mm_add_epi16(x1, neg); + pxor %34, xmm4 ; x1 = _mm_xor_si128(x1, neg); + pxor %35, xmm5 ; x1 = _mm_xor_si128(x1, neg); + pxor %36, xmm6 ; x1 = _mm_xor_si128(x1, neg); + pxor %37, xmm7 ; x1 = _mm_xor_si128(x1, neg); + pxor xmm4, %34 ; neg = _mm_xor_si128(neg, x1); + pxor xmm5, %35 ; neg = _mm_xor_si128(neg, x1); + pxor xmm6, %36 ; neg = _mm_xor_si128(neg, x1); + pxor xmm7, %37 ; neg = _mm_xor_si128(neg, x1); + movdqa XMMWORD [esp + t1 + %1 * SIZEOF_WORD], %34 ; _mm_storeu_si128((__m128i *)(t1 + ko), x1); + movdqa XMMWORD [esp + t1 + (%1 + 8) * SIZEOF_WORD], %35 ; _mm_storeu_si128((__m128i *)(t1 + ko + 8), x1); + movdqa XMMWORD [esp + t1 + (%1 + 16) * SIZEOF_WORD], %36 ; _mm_storeu_si128((__m128i *)(t1 + ko + 16), x1); + movdqa XMMWORD [esp + t1 + (%1 + 24) * SIZEOF_WORD], %37 ; _mm_storeu_si128((__m128i *)(t1 + ko + 24), x1); + movdqa XMMWORD [esp + t2 + %1 * SIZEOF_WORD], xmm4 ; _mm_storeu_si128((__m128i *)(t2 + ko), neg); + movdqa XMMWORD [esp + t2 + (%1 + 8) * SIZEOF_WORD], xmm5 ; _mm_storeu_si128((__m128i *)(t2 + ko + 8), neg); + movdqa XMMWORD [esp + t2 + (%1 + 16) * SIZEOF_WORD], xmm6 ; _mm_storeu_si128((__m128i *)(t2 + ko + 16), neg); + movdqa XMMWORD [esp + t2 + (%1 + 24) * SIZEOF_WORD], xmm7 ; _mm_storeu_si128((__m128i *)(t2 + ko + 24), neg); +%endmacro + +; +; Encode a single block's worth of coefficients. +; +; GLOBAL(JOCTET*) +; jsimd_huff_encode_one_block_sse2 (working_state *state, JOCTET *buffer, +; JCOEFPTR block, int last_dc_val, +; c_derived_tbl *dctbl, c_derived_tbl *actbl) +; + +; eax + 8 = working_state *state +; eax + 12 = JOCTET *buffer +; eax + 16 = JCOEFPTR block +; eax + 20 = int last_dc_val +; eax + 24 = c_derived_tbl *dctbl +; eax + 28 = c_derived_tbl *actbl + +%define pad 6*SIZEOF_DWORD ; Align to 16 bytes +%define t1 pad +%define t2 t1+(DCTSIZE2*SIZEOF_WORD) +%define block t2+(DCTSIZE2*SIZEOF_WORD) +%define actbl block+SIZEOF_DWORD +%define buffer actbl+SIZEOF_DWORD +%define temp buffer+SIZEOF_DWORD +%define temp2 temp+SIZEOF_DWORD +%define temp3 temp2+SIZEOF_DWORD +%define temp4 temp3+SIZEOF_DWORD +%define temp5 temp4+SIZEOF_DWORD +%define gotptr temp5+SIZEOF_DWORD ; void *gotptr +%define put_buffer ebx +%define put_bits edi + + align 16 + global EXTN(jsimd_huff_encode_one_block_sse2) + +EXTN(jsimd_huff_encode_one_block_sse2): + push ebp + mov eax,esp ; eax = original ebp + sub esp, byte 4 + and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits + mov [esp],eax + mov ebp,esp ; ebp = aligned ebp + sub esp, temp5+9*SIZEOF_DWORD-pad + push ebx + push ecx +; push edx ; need not be preserved + push esi + push edi + push ebp + + mov esi, POINTER [eax+8] ; (working_state *state) + mov put_buffer, DWORD [esi+8] ; put_buffer = state->cur.put_buffer; + mov put_bits, DWORD [esi+12] ; put_bits = state->cur.put_bits; + push esi ; esi is now scratch + + get_GOT edx ; get GOT address + movpic POINTER [esp+gotptr], edx ; save GOT address + + mov ecx, POINTER [eax+28] + mov edx, POINTER [eax+16] + mov esi, POINTER [eax+12] + mov POINTER [esp+actbl], ecx + mov POINTER [esp+block], edx + mov POINTER [esp+buffer], esi + + ; Encode the DC coefficient difference per section F.1.2.1 + mov esi, POINTER [esp+block] ; block + movsx ecx, word [esi] ; temp = temp2 = block[0] - last_dc_val; + sub ecx, DWORD [eax+20] + mov esi, ecx + + ; This is a well-known technique for obtaining the absolute value + ; without a branch. It is derived from an assembly language technique + ; presented in "How to Optimize for the Pentium Processors", + ; Copyright (c) 1996, 1997 by Agner Fog. + mov edx, ecx + sar edx, 31 ; temp3 = temp >> (CHAR_BIT * sizeof(int) - 1); + xor ecx, edx ; temp ^= temp3; + sub ecx, edx ; temp -= temp3; + + ; For a negative input, want temp2 = bitwise complement of abs(input) + ; This code assumes we are on a two's complement machine + add esi, edx ; temp2 += temp3; + mov DWORD [esp+temp], esi ; backup temp2 in temp + + ; Find the number of bits needed for the magnitude of the coefficient + movpic ebp, POINTER [esp+gotptr] ; load GOT address (ebp) + movzx edx, byte [GOTOFF(ebp, jpeg_nbits_table + ecx)] ; nbits = JPEG_NBITS(temp); + mov DWORD [esp+temp2], edx ; backup nbits in temp2 + + ; Emit the Huffman-coded symbol for the number of bits + mov ebp, POINTER [eax+24] ; After this point, arguments are not accessible anymore + mov eax, INT [ebp + edx * 4] ; code = dctbl->ehufco[nbits]; + movzx ecx, byte [ebp + edx + 1024] ; size = dctbl->ehufsi[nbits]; + EMIT_BITS eax ; EMIT_BITS(code, size) + + mov ecx, DWORD [esp+temp2] ; restore nbits + + ; Mask off any extra bits in code + mov eax, 1 + shl eax, cl + dec eax + and eax, DWORD [esp+temp] ; temp2 &= (((JLONG) 1)<<nbits) - 1; + + ; Emit that number of bits of the value, if positive, + ; or the complement of its magnitude, if negative. + EMIT_BITS eax ; EMIT_BITS(temp2, nbits) + + ; Prepare data + xor ecx, ecx + mov esi, POINTER [esp+block] + kloop_prepare 0, 1, 8, 16, 9, 2, 3, 10, 17, 24, 32, 25, \ + 18, 11, 4, 5, 12, 19, 26, 33, 40, 48, 41, 34, \ + 27, 20, 13, 6, 7, 14, 21, 28, 35, \ + xmm0, xmm1, xmm2, xmm3 + kloop_prepare 32, 42, 49, 56, 57, 50, 43, 36, 29, 22, 15, 23, \ + 30, 37, 44, 51, 58, 59, 52, 45, 38, 31, 39, 46, \ + 53, 60, 61, 54, 47, 55, 62, 63, 63, \ + xmm0, xmm1, xmm2, xmm3 + + pxor xmm7, xmm7 + movdqa xmm0, XMMWORD [esp + t1 + 0 * SIZEOF_WORD] ; __m128i tmp0 = _mm_loadu_si128((__m128i *)(t1 + 0)); + movdqa xmm1, XMMWORD [esp + t1 + 8 * SIZEOF_WORD] ; __m128i tmp1 = _mm_loadu_si128((__m128i *)(t1 + 8)); + movdqa xmm2, XMMWORD [esp + t1 + 16 * SIZEOF_WORD] ; __m128i tmp2 = _mm_loadu_si128((__m128i *)(t1 + 16)); + movdqa xmm3, XMMWORD [esp + t1 + 24 * SIZEOF_WORD] ; __m128i tmp3 = _mm_loadu_si128((__m128i *)(t1 + 24)); + pcmpeqw xmm0, xmm7 ; tmp0 = _mm_cmpeq_epi16(tmp0, zero); + pcmpeqw xmm1, xmm7 ; tmp1 = _mm_cmpeq_epi16(tmp1, zero); + pcmpeqw xmm2, xmm7 ; tmp2 = _mm_cmpeq_epi16(tmp2, zero); + pcmpeqw xmm3, xmm7 ; tmp3 = _mm_cmpeq_epi16(tmp3, zero); + packsswb xmm0, xmm1 ; tmp0 = _mm_packs_epi16(tmp0, tmp1); + packsswb xmm2, xmm3 ; tmp2 = _mm_packs_epi16(tmp2, tmp3); + pmovmskb edx, xmm0 ; index = ((uint64_t)_mm_movemask_epi8(tmp0)) << 0; + pmovmskb ecx, xmm2 ; index = ((uint64_t)_mm_movemask_epi8(tmp2)) << 16; + shl ecx, 16 + or edx, ecx + not edx ; index = ~index; + + lea esi, [esp+t1] + mov ebp, POINTER [esp+actbl] ; ebp = actbl + +.BLOOP: + bsf ecx, edx ; r = __builtin_ctzl(index); + jz .ELOOP + lea esi, [esi+ecx*2] ; k += r; + shr edx, cl ; index >>= r; + mov DWORD [esp+temp3], edx +.BRLOOP: + cmp ecx, 16 ; while (r > 15) { + jl .ERLOOP + sub ecx, 16 ; r -= 16; + mov DWORD [esp+temp], ecx + mov eax, INT [ebp + 240 * 4] ; code_0xf0 = actbl->ehufco[0xf0]; + movzx ecx, byte [ebp + 1024 + 240] ; size_0xf0 = actbl->ehufsi[0xf0]; + EMIT_BITS eax ; EMIT_BITS(code_0xf0, size_0xf0) + mov ecx, DWORD [esp+temp] + jmp .BRLOOP +.ERLOOP: + movsx eax, word [esi] ; temp = t1[k]; + movpic edx, POINTER [esp+gotptr] ; load GOT address (edx) + movzx eax, byte [GOTOFF(edx, jpeg_nbits_table + eax)] ; nbits = JPEG_NBITS(temp); + mov DWORD [esp+temp2], eax + ; Emit Huffman symbol for run length / number of bits + shl ecx, 4 ; temp3 = (r << 4) + nbits; + add ecx, eax + mov eax, INT [ebp + ecx * 4] ; code = actbl->ehufco[temp3]; + movzx ecx, byte [ebp + ecx + 1024] ; size = actbl->ehufsi[temp3]; + EMIT_BITS eax + + movsx edx, word [esi+DCTSIZE2*2] ; temp2 = t2[k]; + ; Mask off any extra bits in code + mov ecx, DWORD [esp+temp2] + mov eax, 1 + shl eax, cl + dec eax + and eax, edx ; temp2 &= (((JLONG) 1)<<nbits) - 1; + EMIT_BITS eax ; PUT_BITS(temp2, nbits) + mov edx, DWORD [esp+temp3] + add esi, 2 ; ++k; + shr edx, 1 ; index >>= 1; + + jmp .BLOOP +.ELOOP: + movdqa xmm0, XMMWORD [esp + t1 + 32 * SIZEOF_WORD] ; __m128i tmp0 = _mm_loadu_si128((__m128i *)(t1 + 0)); + movdqa xmm1, XMMWORD [esp + t1 + 40 * SIZEOF_WORD] ; __m128i tmp1 = _mm_loadu_si128((__m128i *)(t1 + 8)); + movdqa xmm2, XMMWORD [esp + t1 + 48 * SIZEOF_WORD] ; __m128i tmp2 = _mm_loadu_si128((__m128i *)(t1 + 16)); + movdqa xmm3, XMMWORD [esp + t1 + 56 * SIZEOF_WORD] ; __m128i tmp3 = _mm_loadu_si128((__m128i *)(t1 + 24)); + pcmpeqw xmm0, xmm7 ; tmp0 = _mm_cmpeq_epi16(tmp0, zero); + pcmpeqw xmm1, xmm7 ; tmp1 = _mm_cmpeq_epi16(tmp1, zero); + pcmpeqw xmm2, xmm7 ; tmp2 = _mm_cmpeq_epi16(tmp2, zero); + pcmpeqw xmm3, xmm7 ; tmp3 = _mm_cmpeq_epi16(tmp3, zero); + packsswb xmm0, xmm1 ; tmp0 = _mm_packs_epi16(tmp0, tmp1); + packsswb xmm2, xmm3 ; tmp2 = _mm_packs_epi16(tmp2, tmp3); + pmovmskb edx, xmm0 ; index = ((uint64_t)_mm_movemask_epi8(tmp0)) << 0; + pmovmskb ecx, xmm2 ; index = ((uint64_t)_mm_movemask_epi8(tmp2)) << 16; + shl ecx, 16 + or edx, ecx + not edx ; index = ~index; + + lea eax, [esp + t1 + (DCTSIZE2/2) * 2] + sub eax, esi + shr eax, 1 + bsf ecx, edx ; r = __builtin_ctzl(index); + jz .ELOOP2 + shr edx, cl ; index >>= r; + add ecx, eax + lea esi, [esi+ecx*2] ; k += r; + mov DWORD [esp+temp3], edx + jmp .BRLOOP2 +.BLOOP2: + bsf ecx, edx ; r = __builtin_ctzl(index); + jz .ELOOP2 + lea esi, [esi+ecx*2] ; k += r; + shr edx, cl ; index >>= r; + mov DWORD [esp+temp3], edx +.BRLOOP2: + cmp ecx, 16 ; while (r > 15) { + jl .ERLOOP2 + sub ecx, 16 ; r -= 16; + mov DWORD [esp+temp], ecx + mov eax, INT [ebp + 240 * 4] ; code_0xf0 = actbl->ehufco[0xf0]; + movzx ecx, byte [ebp + 1024 + 240] ; size_0xf0 = actbl->ehufsi[0xf0]; + EMIT_BITS eax ; EMIT_BITS(code_0xf0, size_0xf0) + mov ecx, DWORD [esp+temp] + jmp .BRLOOP2 +.ERLOOP2: + movsx eax, word [esi] ; temp = t1[k]; + bsr eax, eax ; nbits = 32 - __builtin_clz(temp); + inc eax + mov DWORD [esp+temp2], eax + ; Emit Huffman symbol for run length / number of bits + shl ecx, 4 ; temp3 = (r << 4) + nbits; + add ecx, eax + mov eax, INT [ebp + ecx * 4] ; code = actbl->ehufco[temp3]; + movzx ecx, byte [ebp + ecx + 1024] ; size = actbl->ehufsi[temp3]; + EMIT_BITS eax + + movsx edx, word [esi+DCTSIZE2*2] ; temp2 = t2[k]; + ; Mask off any extra bits in code + mov ecx, DWORD [esp+temp2] + mov eax, 1 + shl eax, cl + dec eax + and eax, edx ; temp2 &= (((JLONG) 1)<<nbits) - 1; + EMIT_BITS eax ; PUT_BITS(temp2, nbits) + mov edx, DWORD [esp+temp3] + add esi, 2 ; ++k; + shr edx, 1 ; index >>= 1; + + jmp .BLOOP2 +.ELOOP2: + ; If the last coef(s) were zero, emit an end-of-block code + lea edx, [esp + t1 + (DCTSIZE2-1) * 2] ; r = DCTSIZE2-1-k; + cmp edx, esi ; if (r > 0) { + je .EFN + mov eax, INT [ebp] ; code = actbl->ehufco[0]; + movzx ecx, byte [ebp + 1024] ; size = actbl->ehufsi[0]; + EMIT_BITS eax +.EFN: + mov eax, [esp+buffer] + pop esi + ; Save put_buffer & put_bits + mov DWORD [esi+8], put_buffer ; state->cur.put_buffer = put_buffer; + mov DWORD [esi+12], put_bits ; state->cur.put_bits = put_bits; + + pop ebp + pop edi + pop esi +; pop edx ; need not be preserved + pop ecx + pop ebx + mov esp,ebp ; esp <- aligned ebp + pop esp ; esp <- original ebp + pop ebp + ret + +; For some reason, the OS X linker does not honor the request to align the +; segment unless we do this. + align 16 diff --git a/media/libjpeg/simd/jcolsamp.inc b/media/libjpeg/simd/jcolsamp.inc new file mode 100644 index 000000000..3be446e84 --- /dev/null +++ b/media/libjpeg/simd/jcolsamp.inc @@ -0,0 +1,104 @@ +; +; jcolsamp.inc - private declarations for color conversion & up/downsampling +; +; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB +; +; Based on the x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; [TAB8] + +; -------------------------------------------------------------------------- + +; pseudo-resisters to make ordering of RGB configurable +; +%if RGB_RED == 0 +%define mmA mm0 +%define mmB mm1 +%define xmmA xmm0 +%define xmmB xmm1 +%elif RGB_GREEN == 0 +%define mmA mm2 +%define mmB mm3 +%define xmmA xmm2 +%define xmmB xmm3 +%elif RGB_BLUE == 0 +%define mmA mm4 +%define mmB mm5 +%define xmmA xmm4 +%define xmmB xmm5 +%else +%define mmA mm6 +%define mmB mm7 +%define xmmA xmm6 +%define xmmB xmm7 +%endif + +%if RGB_RED == 1 +%define mmC mm0 +%define mmD mm1 +%define xmmC xmm0 +%define xmmD xmm1 +%elif RGB_GREEN == 1 +%define mmC mm2 +%define mmD mm3 +%define xmmC xmm2 +%define xmmD xmm3 +%elif RGB_BLUE == 1 +%define mmC mm4 +%define mmD mm5 +%define xmmC xmm4 +%define xmmD xmm5 +%else +%define mmC mm6 +%define mmD mm7 +%define xmmC xmm6 +%define xmmD xmm7 +%endif + +%if RGB_RED == 2 +%define mmE mm0 +%define mmF mm1 +%define xmmE xmm0 +%define xmmF xmm1 +%elif RGB_GREEN == 2 +%define mmE mm2 +%define mmF mm3 +%define xmmE xmm2 +%define xmmF xmm3 +%elif RGB_BLUE == 2 +%define mmE mm4 +%define mmF mm5 +%define xmmE xmm4 +%define xmmF xmm5 +%else +%define mmE mm6 +%define mmF mm7 +%define xmmE xmm6 +%define xmmF xmm7 +%endif + +%if RGB_RED == 3 +%define mmG mm0 +%define mmH mm1 +%define xmmG xmm0 +%define xmmH xmm1 +%elif RGB_GREEN == 3 +%define mmG mm2 +%define mmH mm3 +%define xmmG xmm2 +%define xmmH xmm3 +%elif RGB_BLUE == 3 +%define mmG mm4 +%define mmH mm5 +%define xmmG xmm4 +%define xmmH xmm5 +%else +%define mmG mm6 +%define mmH mm7 +%define xmmG xmm6 +%define xmmH xmm7 +%endif + +; -------------------------------------------------------------------------- diff --git a/media/libjpeg/simd/jcsample-altivec.c b/media/libjpeg/simd/jcsample-altivec.c new file mode 100644 index 000000000..11609d9da --- /dev/null +++ b/media/libjpeg/simd/jcsample-altivec.c @@ -0,0 +1,158 @@ +/* + * AltiVec optimizations for libjpeg-turbo + * + * Copyright (C) 2015, D. R. Commander. All Rights Reserved. + * + * This software is provided 'as-is', without any express or implied + * warranty. In no event will the authors be held liable for any damages + * arising from the use of this software. + * + * Permission is granted to anyone to use this software for any purpose, + * including commercial applications, and to alter it and redistribute it + * freely, subject to the following restrictions: + * + * 1. The origin of this software must not be misrepresented; you must not + * claim that you wrote the original software. If you use this software + * in a product, an acknowledgment in the product documentation would be + * appreciated but is not required. + * 2. Altered source versions must be plainly marked as such, and must not be + * misrepresented as being the original software. + * 3. This notice may not be removed or altered from any source distribution. + */ + +/* CHROMA DOWNSAMPLING */ + +#include "jsimd_altivec.h" +#include "jcsample.h" + + +void +jsimd_h2v1_downsample_altivec (JDIMENSION image_width, int max_v_samp_factor, + JDIMENSION v_samp_factor, + JDIMENSION width_blocks, + JSAMPARRAY input_data, JSAMPARRAY output_data) +{ + int outrow, outcol; + JDIMENSION output_cols = width_blocks * DCTSIZE; + JSAMPROW inptr, outptr; + + __vector unsigned char this0, next0, out; + __vector unsigned short this0e, this0o, next0e, next0o, outl, outh; + + /* Constants */ + __vector unsigned short pw_bias = { __4X2(0, 1) }, + pw_one = { __8X(1) }; + __vector unsigned char even_odd_index = + {0,2,4,6,8,10,12,14,1,3,5,7,9,11,13,15}, + pb_zero = { __16X(0) }; + + expand_right_edge(input_data, max_v_samp_factor, image_width, + output_cols * 2); + + for (outrow = 0; outrow < v_samp_factor; outrow++) { + outptr = output_data[outrow]; + inptr = input_data[outrow]; + + for (outcol = output_cols; outcol > 0; + outcol -= 16, inptr += 32, outptr += 16) { + + this0 = vec_ld(0, inptr); + this0 = vec_perm(this0, this0, even_odd_index); + this0e = (__vector unsigned short)VEC_UNPACKHU(this0); + this0o = (__vector unsigned short)VEC_UNPACKLU(this0); + outl = vec_add(this0e, this0o); + outl = vec_add(outl, pw_bias); + outl = vec_sr(outl, pw_one); + + if (outcol > 8) { + next0 = vec_ld(16, inptr); + next0 = vec_perm(next0, next0, even_odd_index); + next0e = (__vector unsigned short)VEC_UNPACKHU(next0); + next0o = (__vector unsigned short)VEC_UNPACKLU(next0); + outh = vec_add(next0e, next0o); + outh = vec_add(outh, pw_bias); + outh = vec_sr(outh, pw_one); + } else + outh = vec_splat_u16(0); + + out = vec_pack(outl, outh); + vec_st(out, 0, outptr); + } + } +} + + +void +jsimd_h2v2_downsample_altivec (JDIMENSION image_width, int max_v_samp_factor, + JDIMENSION v_samp_factor, + JDIMENSION width_blocks, + JSAMPARRAY input_data, JSAMPARRAY output_data) +{ + int inrow, outrow, outcol; + JDIMENSION output_cols = width_blocks * DCTSIZE; + JSAMPROW inptr0, inptr1, outptr; + + __vector unsigned char this0, next0, this1, next1, out; + __vector unsigned short this0e, this0o, next0e, next0o, this1e, this1o, + next1e, next1o, out0l, out0h, out1l, out1h, outl, outh; + + /* Constants */ + __vector unsigned short pw_bias = { __4X2(1, 2) }, + pw_two = { __8X(2) }; + __vector unsigned char even_odd_index = + { 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 }, + pb_zero = { __16X(0) }; + + expand_right_edge(input_data, max_v_samp_factor, image_width, + output_cols * 2); + + for (inrow = 0, outrow = 0; outrow < v_samp_factor; + inrow += 2, outrow++) { + + inptr0 = input_data[inrow]; + inptr1 = input_data[inrow + 1]; + outptr = output_data[outrow]; + + for (outcol = output_cols; outcol > 0; + outcol -= 16, inptr0 += 32, inptr1 += 32, outptr += 16) { + + this0 = vec_ld(0, inptr0); + this0 = vec_perm(this0, this0, even_odd_index); + this0e = (__vector unsigned short)VEC_UNPACKHU(this0); + this0o = (__vector unsigned short)VEC_UNPACKLU(this0); + out0l = vec_add(this0e, this0o); + + this1 = vec_ld(0, inptr1); + this1 = vec_perm(this1, this1, even_odd_index); + this1e = (__vector unsigned short)VEC_UNPACKHU(this1); + this1o = (__vector unsigned short)VEC_UNPACKLU(this1); + out1l = vec_add(this1e, this1o); + + outl = vec_add(out0l, out1l); + outl = vec_add(outl, pw_bias); + outl = vec_sr(outl, pw_two); + + if (outcol > 8) { + next0 = vec_ld(16, inptr0); + next0 = vec_perm(next0, next0, even_odd_index); + next0e = (__vector unsigned short)VEC_UNPACKHU(next0); + next0o = (__vector unsigned short)VEC_UNPACKLU(next0); + out0h = vec_add(next0e, next0o); + + next1 = vec_ld(16, inptr1); + next1 = vec_perm(next1, next1, even_odd_index); + next1e = (__vector unsigned short)VEC_UNPACKHU(next1); + next1o = (__vector unsigned short)VEC_UNPACKLU(next1); + out1h = vec_add(next1e, next1o); + + outh = vec_add(out0h, out1h); + outh = vec_add(outh, pw_bias); + outh = vec_sr(outh, pw_two); + } else + outh = vec_splat_u16(0); + + out = vec_pack(outl, outh); + vec_st(out, 0, outptr); + } + } +} diff --git a/media/libjpeg/simd/jcsample-mmx.asm b/media/libjpeg/simd/jcsample-mmx.asm new file mode 100644 index 000000000..6cd544e74 --- /dev/null +++ b/media/libjpeg/simd/jcsample-mmx.asm @@ -0,0 +1,323 @@ +; +; jcsample.asm - downsampling (MMX) +; +; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB +; +; Based on the x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler). +; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 +; +; [TAB8] + +%include "jsimdext.inc" + +; -------------------------------------------------------------------------- + SECTION SEG_TEXT + BITS 32 +; +; Downsample pixel values of a single component. +; This version handles the common case of 2:1 horizontal and 1:1 vertical, +; without smoothing. +; +; GLOBAL(void) +; jsimd_h2v1_downsample_mmx (JDIMENSION image_width, int max_v_samp_factor, +; JDIMENSION v_samp_factor, JDIMENSION width_blocks, +; JSAMPARRAY input_data, JSAMPARRAY output_data); +; + +%define img_width(b) (b)+8 ; JDIMENSION image_width +%define max_v_samp(b) (b)+12 ; int max_v_samp_factor +%define v_samp(b) (b)+16 ; JDIMENSION v_samp_factor +%define width_blks(b) (b)+20 ; JDIMENSION width_blocks +%define input_data(b) (b)+24 ; JSAMPARRAY input_data +%define output_data(b) (b)+28 ; JSAMPARRAY output_data + + align 16 + global EXTN(jsimd_h2v1_downsample_mmx) + +EXTN(jsimd_h2v1_downsample_mmx): + push ebp + mov ebp,esp +; push ebx ; unused +; push ecx ; need not be preserved +; push edx ; need not be preserved + push esi + push edi + + mov ecx, JDIMENSION [width_blks(ebp)] + shl ecx,3 ; imul ecx,DCTSIZE (ecx = output_cols) + jz near .return + + mov edx, JDIMENSION [img_width(ebp)] + + ; -- expand_right_edge + + push ecx + shl ecx,1 ; output_cols * 2 + sub ecx,edx + jle short .expand_end + + mov eax, INT [max_v_samp(ebp)] + test eax,eax + jle short .expand_end + + cld + mov esi, JSAMPARRAY [input_data(ebp)] ; input_data + alignx 16,7 +.expandloop: + push eax + push ecx + + mov edi, JSAMPROW [esi] + add edi,edx + mov al, JSAMPLE [edi-1] + + rep stosb + + pop ecx + pop eax + + add esi, byte SIZEOF_JSAMPROW + dec eax + jg short .expandloop + +.expand_end: + pop ecx ; output_cols + + ; -- h2v1_downsample + + mov eax, JDIMENSION [v_samp(ebp)] ; rowctr + test eax,eax + jle near .return + + mov edx, 0x00010000 ; bias pattern + movd mm7,edx + pcmpeqw mm6,mm6 + punpckldq mm7,mm7 ; mm7={0, 1, 0, 1} + psrlw mm6,BYTE_BIT ; mm6={0xFF 0x00 0xFF 0x00 ..} + + mov esi, JSAMPARRAY [input_data(ebp)] ; input_data + mov edi, JSAMPARRAY [output_data(ebp)] ; output_data + alignx 16,7 +.rowloop: + push ecx + push edi + push esi + + mov esi, JSAMPROW [esi] ; inptr + mov edi, JSAMPROW [edi] ; outptr + alignx 16,7 +.columnloop: + + movq mm0, MMWORD [esi+0*SIZEOF_MMWORD] + movq mm1, MMWORD [esi+1*SIZEOF_MMWORD] + movq mm2,mm0 + movq mm3,mm1 + + pand mm0,mm6 + psrlw mm2,BYTE_BIT + pand mm1,mm6 + psrlw mm3,BYTE_BIT + + paddw mm0,mm2 + paddw mm1,mm3 + paddw mm0,mm7 + paddw mm1,mm7 + psrlw mm0,1 + psrlw mm1,1 + + packuswb mm0,mm1 + + movq MMWORD [edi+0*SIZEOF_MMWORD], mm0 + + add esi, byte 2*SIZEOF_MMWORD ; inptr + add edi, byte 1*SIZEOF_MMWORD ; outptr + sub ecx, byte SIZEOF_MMWORD ; outcol + jnz short .columnloop + + pop esi + pop edi + pop ecx + + add esi, byte SIZEOF_JSAMPROW ; input_data + add edi, byte SIZEOF_JSAMPROW ; output_data + dec eax ; rowctr + jg short .rowloop + + emms ; empty MMX state + +.return: + pop edi + pop esi +; pop edx ; need not be preserved +; pop ecx ; need not be preserved +; pop ebx ; unused + pop ebp + ret + +; -------------------------------------------------------------------------- +; +; Downsample pixel values of a single component. +; This version handles the standard case of 2:1 horizontal and 2:1 vertical, +; without smoothing. +; +; GLOBAL(void) +; jsimd_h2v2_downsample_mmx (JDIMENSION image_width, int max_v_samp_factor, +; JDIMENSION v_samp_factor, JDIMENSION width_blocks, +; JSAMPARRAY input_data, JSAMPARRAY output_data); +; + +%define img_width(b) (b)+8 ; JDIMENSION image_width +%define max_v_samp(b) (b)+12 ; int max_v_samp_factor +%define v_samp(b) (b)+16 ; JDIMENSION v_samp_factor +%define width_blks(b) (b)+20 ; JDIMENSION width_blocks +%define input_data(b) (b)+24 ; JSAMPARRAY input_data +%define output_data(b) (b)+28 ; JSAMPARRAY output_data + + align 16 + global EXTN(jsimd_h2v2_downsample_mmx) + +EXTN(jsimd_h2v2_downsample_mmx): + push ebp + mov ebp,esp +; push ebx ; unused +; push ecx ; need not be preserved +; push edx ; need not be preserved + push esi + push edi + + mov ecx, JDIMENSION [width_blks(ebp)] + shl ecx,3 ; imul ecx,DCTSIZE (ecx = output_cols) + jz near .return + + mov edx, JDIMENSION [img_width(ebp)] + + ; -- expand_right_edge + + push ecx + shl ecx,1 ; output_cols * 2 + sub ecx,edx + jle short .expand_end + + mov eax, INT [max_v_samp(ebp)] + test eax,eax + jle short .expand_end + + cld + mov esi, JSAMPARRAY [input_data(ebp)] ; input_data + alignx 16,7 +.expandloop: + push eax + push ecx + + mov edi, JSAMPROW [esi] + add edi,edx + mov al, JSAMPLE [edi-1] + + rep stosb + + pop ecx + pop eax + + add esi, byte SIZEOF_JSAMPROW + dec eax + jg short .expandloop + +.expand_end: + pop ecx ; output_cols + + ; -- h2v2_downsample + + mov eax, JDIMENSION [v_samp(ebp)] ; rowctr + test eax,eax + jle near .return + + mov edx, 0x00020001 ; bias pattern + movd mm7,edx + pcmpeqw mm6,mm6 + punpckldq mm7,mm7 ; mm7={1, 2, 1, 2} + psrlw mm6,BYTE_BIT ; mm6={0xFF 0x00 0xFF 0x00 ..} + + mov esi, JSAMPARRAY [input_data(ebp)] ; input_data + mov edi, JSAMPARRAY [output_data(ebp)] ; output_data + alignx 16,7 +.rowloop: + push ecx + push edi + push esi + + mov edx, JSAMPROW [esi+0*SIZEOF_JSAMPROW] ; inptr0 + mov esi, JSAMPROW [esi+1*SIZEOF_JSAMPROW] ; inptr1 + mov edi, JSAMPROW [edi] ; outptr + alignx 16,7 +.columnloop: + + movq mm0, MMWORD [edx+0*SIZEOF_MMWORD] + movq mm1, MMWORD [esi+0*SIZEOF_MMWORD] + movq mm2, MMWORD [edx+1*SIZEOF_MMWORD] + movq mm3, MMWORD [esi+1*SIZEOF_MMWORD] + + movq mm4,mm0 + movq mm5,mm1 + pand mm0,mm6 + psrlw mm4,BYTE_BIT + pand mm1,mm6 + psrlw mm5,BYTE_BIT + paddw mm0,mm4 + paddw mm1,mm5 + + movq mm4,mm2 + movq mm5,mm3 + pand mm2,mm6 + psrlw mm4,BYTE_BIT + pand mm3,mm6 + psrlw mm5,BYTE_BIT + paddw mm2,mm4 + paddw mm3,mm5 + + paddw mm0,mm1 + paddw mm2,mm3 + paddw mm0,mm7 + paddw mm2,mm7 + psrlw mm0,2 + psrlw mm2,2 + + packuswb mm0,mm2 + + movq MMWORD [edi+0*SIZEOF_MMWORD], mm0 + + add edx, byte 2*SIZEOF_MMWORD ; inptr0 + add esi, byte 2*SIZEOF_MMWORD ; inptr1 + add edi, byte 1*SIZEOF_MMWORD ; outptr + sub ecx, byte SIZEOF_MMWORD ; outcol + jnz near .columnloop + + pop esi + pop edi + pop ecx + + add esi, byte 2*SIZEOF_JSAMPROW ; input_data + add edi, byte 1*SIZEOF_JSAMPROW ; output_data + dec eax ; rowctr + jg near .rowloop + + emms ; empty MMX state + +.return: + pop edi + pop esi +; pop edx ; need not be preserved +; pop ecx ; need not be preserved +; pop ebx ; unused + pop ebp + ret + +; For some reason, the OS X linker does not honor the request to align the +; segment unless we do this. + align 16 diff --git a/media/libjpeg/simd/jcsample-sse2-64.asm b/media/libjpeg/simd/jcsample-sse2-64.asm new file mode 100644 index 000000000..40ee15fcb --- /dev/null +++ b/media/libjpeg/simd/jcsample-sse2-64.asm @@ -0,0 +1,329 @@ +; +; jcsample.asm - downsampling (64-bit SSE2) +; +; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB +; Copyright (C) 2009, D. R. Commander. +; +; Based on the x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler). +; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 +; +; [TAB8] + +%include "jsimdext.inc" + +; -------------------------------------------------------------------------- + SECTION SEG_TEXT + BITS 64 +; +; Downsample pixel values of a single component. +; This version handles the common case of 2:1 horizontal and 1:1 vertical, +; without smoothing. +; +; GLOBAL(void) +; jsimd_h2v1_downsample_sse2 (JDIMENSION image_width, int max_v_samp_factor, +; JDIMENSION v_samp_factor, JDIMENSION width_blocks, +; JSAMPARRAY input_data, JSAMPARRAY output_data); +; + +; r10 = JDIMENSION image_width +; r11 = int max_v_samp_factor +; r12 = JDIMENSION v_samp_factor +; r13 = JDIMENSION width_blocks +; r14 = JSAMPARRAY input_data +; r15 = JSAMPARRAY output_data + + align 16 + global EXTN(jsimd_h2v1_downsample_sse2) + +EXTN(jsimd_h2v1_downsample_sse2): + push rbp + mov rax,rsp + mov rbp,rsp + collect_args + + mov ecx, r13d + shl rcx,3 ; imul rcx,DCTSIZE (rcx = output_cols) + jz near .return + + mov edx, r10d + + ; -- expand_right_edge + + push rcx + shl rcx,1 ; output_cols * 2 + sub rcx,rdx + jle short .expand_end + + mov rax, r11 + test rax,rax + jle short .expand_end + + cld + mov rsi, r14 ; input_data +.expandloop: + push rax + push rcx + + mov rdi, JSAMPROW [rsi] + add rdi,rdx + mov al, JSAMPLE [rdi-1] + + rep stosb + + pop rcx + pop rax + + add rsi, byte SIZEOF_JSAMPROW + dec rax + jg short .expandloop + +.expand_end: + pop rcx ; output_cols + + ; -- h2v1_downsample + + mov eax, r12d ; rowctr + test eax,eax + jle near .return + + mov rdx, 0x00010000 ; bias pattern + movd xmm7,edx + pcmpeqw xmm6,xmm6 + pshufd xmm7,xmm7,0x00 ; xmm7={0, 1, 0, 1, 0, 1, 0, 1} + psrlw xmm6,BYTE_BIT ; xmm6={0xFF 0x00 0xFF 0x00 ..} + + mov rsi, r14 ; input_data + mov rdi, r15 ; output_data +.rowloop: + push rcx + push rdi + push rsi + + mov rsi, JSAMPROW [rsi] ; inptr + mov rdi, JSAMPROW [rdi] ; outptr + + cmp rcx, byte SIZEOF_XMMWORD + jae short .columnloop + +.columnloop_r8: + movdqa xmm0, XMMWORD [rsi+0*SIZEOF_XMMWORD] + pxor xmm1,xmm1 + mov rcx, SIZEOF_XMMWORD + jmp short .downsample + +.columnloop: + movdqa xmm0, XMMWORD [rsi+0*SIZEOF_XMMWORD] + movdqa xmm1, XMMWORD [rsi+1*SIZEOF_XMMWORD] + +.downsample: + movdqa xmm2,xmm0 + movdqa xmm3,xmm1 + + pand xmm0,xmm6 + psrlw xmm2,BYTE_BIT + pand xmm1,xmm6 + psrlw xmm3,BYTE_BIT + + paddw xmm0,xmm2 + paddw xmm1,xmm3 + paddw xmm0,xmm7 + paddw xmm1,xmm7 + psrlw xmm0,1 + psrlw xmm1,1 + + packuswb xmm0,xmm1 + + movdqa XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm0 + + sub rcx, byte SIZEOF_XMMWORD ; outcol + add rsi, byte 2*SIZEOF_XMMWORD ; inptr + add rdi, byte 1*SIZEOF_XMMWORD ; outptr + cmp rcx, byte SIZEOF_XMMWORD + jae short .columnloop + test rcx,rcx + jnz short .columnloop_r8 + + pop rsi + pop rdi + pop rcx + + add rsi, byte SIZEOF_JSAMPROW ; input_data + add rdi, byte SIZEOF_JSAMPROW ; output_data + dec rax ; rowctr + jg near .rowloop + +.return: + uncollect_args + pop rbp + ret + +; -------------------------------------------------------------------------- +; +; Downsample pixel values of a single component. +; This version handles the standard case of 2:1 horizontal and 2:1 vertical, +; without smoothing. +; +; GLOBAL(void) +; jsimd_h2v2_downsample_sse2 (JDIMENSION image_width, int max_v_samp_factor, +; JDIMENSION v_samp_factor, JDIMENSION width_blocks, +; JSAMPARRAY input_data, JSAMPARRAY output_data); +; + +; r10 = JDIMENSION image_width +; r11 = int max_v_samp_factor +; r12 = JDIMENSION v_samp_factor +; r13 = JDIMENSION width_blocks +; r14 = JSAMPARRAY input_data +; r15 = JSAMPARRAY output_data + + align 16 + global EXTN(jsimd_h2v2_downsample_sse2) + +EXTN(jsimd_h2v2_downsample_sse2): + push rbp + mov rax,rsp + mov rbp,rsp + collect_args + + mov ecx, r13d + shl rcx,3 ; imul rcx,DCTSIZE (rcx = output_cols) + jz near .return + + mov edx, r10d + + ; -- expand_right_edge + + push rcx + shl rcx,1 ; output_cols * 2 + sub rcx,rdx + jle short .expand_end + + mov rax, r11 + test rax,rax + jle short .expand_end + + cld + mov rsi, r14 ; input_data +.expandloop: + push rax + push rcx + + mov rdi, JSAMPROW [rsi] + add rdi,rdx + mov al, JSAMPLE [rdi-1] + + rep stosb + + pop rcx + pop rax + + add rsi, byte SIZEOF_JSAMPROW + dec rax + jg short .expandloop + +.expand_end: + pop rcx ; output_cols + + ; -- h2v2_downsample + + mov eax, r12d ; rowctr + test rax,rax + jle near .return + + mov rdx, 0x00020001 ; bias pattern + movd xmm7,edx + pcmpeqw xmm6,xmm6 + pshufd xmm7,xmm7,0x00 ; xmm7={1, 2, 1, 2, 1, 2, 1, 2} + psrlw xmm6,BYTE_BIT ; xmm6={0xFF 0x00 0xFF 0x00 ..} + + mov rsi, r14 ; input_data + mov rdi, r15 ; output_data +.rowloop: + push rcx + push rdi + push rsi + + mov rdx, JSAMPROW [rsi+0*SIZEOF_JSAMPROW] ; inptr0 + mov rsi, JSAMPROW [rsi+1*SIZEOF_JSAMPROW] ; inptr1 + mov rdi, JSAMPROW [rdi] ; outptr + + cmp rcx, byte SIZEOF_XMMWORD + jae short .columnloop + +.columnloop_r8: + movdqa xmm0, XMMWORD [rdx+0*SIZEOF_XMMWORD] + movdqa xmm1, XMMWORD [rsi+0*SIZEOF_XMMWORD] + pxor xmm2,xmm2 + pxor xmm3,xmm3 + mov rcx, SIZEOF_XMMWORD + jmp short .downsample + +.columnloop: + movdqa xmm0, XMMWORD [rdx+0*SIZEOF_XMMWORD] + movdqa xmm1, XMMWORD [rsi+0*SIZEOF_XMMWORD] + movdqa xmm2, XMMWORD [rdx+1*SIZEOF_XMMWORD] + movdqa xmm3, XMMWORD [rsi+1*SIZEOF_XMMWORD] + +.downsample: + movdqa xmm4,xmm0 + movdqa xmm5,xmm1 + pand xmm0,xmm6 + psrlw xmm4,BYTE_BIT + pand xmm1,xmm6 + psrlw xmm5,BYTE_BIT + paddw xmm0,xmm4 + paddw xmm1,xmm5 + + movdqa xmm4,xmm2 + movdqa xmm5,xmm3 + pand xmm2,xmm6 + psrlw xmm4,BYTE_BIT + pand xmm3,xmm6 + psrlw xmm5,BYTE_BIT + paddw xmm2,xmm4 + paddw xmm3,xmm5 + + paddw xmm0,xmm1 + paddw xmm2,xmm3 + paddw xmm0,xmm7 + paddw xmm2,xmm7 + psrlw xmm0,2 + psrlw xmm2,2 + + packuswb xmm0,xmm2 + + movdqa XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm0 + + sub rcx, byte SIZEOF_XMMWORD ; outcol + add rdx, byte 2*SIZEOF_XMMWORD ; inptr0 + add rsi, byte 2*SIZEOF_XMMWORD ; inptr1 + add rdi, byte 1*SIZEOF_XMMWORD ; outptr + cmp rcx, byte SIZEOF_XMMWORD + jae near .columnloop + test rcx,rcx + jnz near .columnloop_r8 + + pop rsi + pop rdi + pop rcx + + add rsi, byte 2*SIZEOF_JSAMPROW ; input_data + add rdi, byte 1*SIZEOF_JSAMPROW ; output_data + dec rax ; rowctr + jg near .rowloop + +.return: + uncollect_args + pop rbp + ret + +; For some reason, the OS X linker does not honor the request to align the +; segment unless we do this. + align 16 diff --git a/media/libjpeg/simd/jcsample-sse2.asm b/media/libjpeg/simd/jcsample-sse2.asm new file mode 100644 index 000000000..83c9d152a --- /dev/null +++ b/media/libjpeg/simd/jcsample-sse2.asm @@ -0,0 +1,350 @@ +; +; jcsample.asm - downsampling (SSE2) +; +; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB +; +; Based on the x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler). +; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 +; +; [TAB8] + +%include "jsimdext.inc" + +; -------------------------------------------------------------------------- + SECTION SEG_TEXT + BITS 32 +; +; Downsample pixel values of a single component. +; This version handles the common case of 2:1 horizontal and 1:1 vertical, +; without smoothing. +; +; GLOBAL(void) +; jsimd_h2v1_downsample_sse2 (JDIMENSION image_width, int max_v_samp_factor, +; JDIMENSION v_samp_factor, JDIMENSION width_blocks, +; JSAMPARRAY input_data, JSAMPARRAY output_data); +; + +%define img_width(b) (b)+8 ; JDIMENSION image_width +%define max_v_samp(b) (b)+12 ; int max_v_samp_factor +%define v_samp(b) (b)+16 ; JDIMENSION v_samp_factor +%define width_blks(b) (b)+20 ; JDIMENSION width_blocks +%define input_data(b) (b)+24 ; JSAMPARRAY input_data +%define output_data(b) (b)+28 ; JSAMPARRAY output_data + + align 16 + global EXTN(jsimd_h2v1_downsample_sse2) + +EXTN(jsimd_h2v1_downsample_sse2): + push ebp + mov ebp,esp +; push ebx ; unused +; push ecx ; need not be preserved +; push edx ; need not be preserved + push esi + push edi + + mov ecx, JDIMENSION [width_blks(ebp)] + shl ecx,3 ; imul ecx,DCTSIZE (ecx = output_cols) + jz near .return + + mov edx, JDIMENSION [img_width(ebp)] + + ; -- expand_right_edge + + push ecx + shl ecx,1 ; output_cols * 2 + sub ecx,edx + jle short .expand_end + + mov eax, INT [max_v_samp(ebp)] + test eax,eax + jle short .expand_end + + cld + mov esi, JSAMPARRAY [input_data(ebp)] ; input_data + alignx 16,7 +.expandloop: + push eax + push ecx + + mov edi, JSAMPROW [esi] + add edi,edx + mov al, JSAMPLE [edi-1] + + rep stosb + + pop ecx + pop eax + + add esi, byte SIZEOF_JSAMPROW + dec eax + jg short .expandloop + +.expand_end: + pop ecx ; output_cols + + ; -- h2v1_downsample + + mov eax, JDIMENSION [v_samp(ebp)] ; rowctr + test eax,eax + jle near .return + + mov edx, 0x00010000 ; bias pattern + movd xmm7,edx + pcmpeqw xmm6,xmm6 + pshufd xmm7,xmm7,0x00 ; xmm7={0, 1, 0, 1, 0, 1, 0, 1} + psrlw xmm6,BYTE_BIT ; xmm6={0xFF 0x00 0xFF 0x00 ..} + + mov esi, JSAMPARRAY [input_data(ebp)] ; input_data + mov edi, JSAMPARRAY [output_data(ebp)] ; output_data + alignx 16,7 +.rowloop: + push ecx + push edi + push esi + + mov esi, JSAMPROW [esi] ; inptr + mov edi, JSAMPROW [edi] ; outptr + + cmp ecx, byte SIZEOF_XMMWORD + jae short .columnloop + alignx 16,7 + +.columnloop_r8: + movdqa xmm0, XMMWORD [esi+0*SIZEOF_XMMWORD] + pxor xmm1,xmm1 + mov ecx, SIZEOF_XMMWORD + jmp short .downsample + alignx 16,7 + +.columnloop: + movdqa xmm0, XMMWORD [esi+0*SIZEOF_XMMWORD] + movdqa xmm1, XMMWORD [esi+1*SIZEOF_XMMWORD] + +.downsample: + movdqa xmm2,xmm0 + movdqa xmm3,xmm1 + + pand xmm0,xmm6 + psrlw xmm2,BYTE_BIT + pand xmm1,xmm6 + psrlw xmm3,BYTE_BIT + + paddw xmm0,xmm2 + paddw xmm1,xmm3 + paddw xmm0,xmm7 + paddw xmm1,xmm7 + psrlw xmm0,1 + psrlw xmm1,1 + + packuswb xmm0,xmm1 + + movdqa XMMWORD [edi+0*SIZEOF_XMMWORD], xmm0 + + sub ecx, byte SIZEOF_XMMWORD ; outcol + add esi, byte 2*SIZEOF_XMMWORD ; inptr + add edi, byte 1*SIZEOF_XMMWORD ; outptr + cmp ecx, byte SIZEOF_XMMWORD + jae short .columnloop + test ecx,ecx + jnz short .columnloop_r8 + + pop esi + pop edi + pop ecx + + add esi, byte SIZEOF_JSAMPROW ; input_data + add edi, byte SIZEOF_JSAMPROW ; output_data + dec eax ; rowctr + jg near .rowloop + +.return: + pop edi + pop esi +; pop edx ; need not be preserved +; pop ecx ; need not be preserved +; pop ebx ; unused + pop ebp + ret + +; -------------------------------------------------------------------------- +; +; Downsample pixel values of a single component. +; This version handles the standard case of 2:1 horizontal and 2:1 vertical, +; without smoothing. +; +; GLOBAL(void) +; jsimd_h2v2_downsample_sse2 (JDIMENSION image_width, int max_v_samp_factor, +; JDIMENSION v_samp_factor, JDIMENSION width_blocks, +; JSAMPARRAY input_data, JSAMPARRAY output_data); +; + +%define img_width(b) (b)+8 ; JDIMENSION image_width +%define max_v_samp(b) (b)+12 ; int max_v_samp_factor +%define v_samp(b) (b)+16 ; JDIMENSION v_samp_factor +%define width_blks(b) (b)+20 ; JDIMENSION width_blocks +%define input_data(b) (b)+24 ; JSAMPARRAY input_data +%define output_data(b) (b)+28 ; JSAMPARRAY output_data + + align 16 + global EXTN(jsimd_h2v2_downsample_sse2) + +EXTN(jsimd_h2v2_downsample_sse2): + push ebp + mov ebp,esp +; push ebx ; unused +; push ecx ; need not be preserved +; push edx ; need not be preserved + push esi + push edi + + mov ecx, JDIMENSION [width_blks(ebp)] + shl ecx,3 ; imul ecx,DCTSIZE (ecx = output_cols) + jz near .return + + mov edx, JDIMENSION [img_width(ebp)] + + ; -- expand_right_edge + + push ecx + shl ecx,1 ; output_cols * 2 + sub ecx,edx + jle short .expand_end + + mov eax, INT [max_v_samp(ebp)] + test eax,eax + jle short .expand_end + + cld + mov esi, JSAMPARRAY [input_data(ebp)] ; input_data + alignx 16,7 +.expandloop: + push eax + push ecx + + mov edi, JSAMPROW [esi] + add edi,edx + mov al, JSAMPLE [edi-1] + + rep stosb + + pop ecx + pop eax + + add esi, byte SIZEOF_JSAMPROW + dec eax + jg short .expandloop + +.expand_end: + pop ecx ; output_cols + + ; -- h2v2_downsample + + mov eax, JDIMENSION [v_samp(ebp)] ; rowctr + test eax,eax + jle near .return + + mov edx, 0x00020001 ; bias pattern + movd xmm7,edx + pcmpeqw xmm6,xmm6 + pshufd xmm7,xmm7,0x00 ; xmm7={1, 2, 1, 2, 1, 2, 1, 2} + psrlw xmm6,BYTE_BIT ; xmm6={0xFF 0x00 0xFF 0x00 ..} + + mov esi, JSAMPARRAY [input_data(ebp)] ; input_data + mov edi, JSAMPARRAY [output_data(ebp)] ; output_data + alignx 16,7 +.rowloop: + push ecx + push edi + push esi + + mov edx, JSAMPROW [esi+0*SIZEOF_JSAMPROW] ; inptr0 + mov esi, JSAMPROW [esi+1*SIZEOF_JSAMPROW] ; inptr1 + mov edi, JSAMPROW [edi] ; outptr + + cmp ecx, byte SIZEOF_XMMWORD + jae short .columnloop + alignx 16,7 + +.columnloop_r8: + movdqa xmm0, XMMWORD [edx+0*SIZEOF_XMMWORD] + movdqa xmm1, XMMWORD [esi+0*SIZEOF_XMMWORD] + pxor xmm2,xmm2 + pxor xmm3,xmm3 + mov ecx, SIZEOF_XMMWORD + jmp short .downsample + alignx 16,7 + +.columnloop: + movdqa xmm0, XMMWORD [edx+0*SIZEOF_XMMWORD] + movdqa xmm1, XMMWORD [esi+0*SIZEOF_XMMWORD] + movdqa xmm2, XMMWORD [edx+1*SIZEOF_XMMWORD] + movdqa xmm3, XMMWORD [esi+1*SIZEOF_XMMWORD] + +.downsample: + movdqa xmm4,xmm0 + movdqa xmm5,xmm1 + pand xmm0,xmm6 + psrlw xmm4,BYTE_BIT + pand xmm1,xmm6 + psrlw xmm5,BYTE_BIT + paddw xmm0,xmm4 + paddw xmm1,xmm5 + + movdqa xmm4,xmm2 + movdqa xmm5,xmm3 + pand xmm2,xmm6 + psrlw xmm4,BYTE_BIT + pand xmm3,xmm6 + psrlw xmm5,BYTE_BIT + paddw xmm2,xmm4 + paddw xmm3,xmm5 + + paddw xmm0,xmm1 + paddw xmm2,xmm3 + paddw xmm0,xmm7 + paddw xmm2,xmm7 + psrlw xmm0,2 + psrlw xmm2,2 + + packuswb xmm0,xmm2 + + movdqa XMMWORD [edi+0*SIZEOF_XMMWORD], xmm0 + + sub ecx, byte SIZEOF_XMMWORD ; outcol + add edx, byte 2*SIZEOF_XMMWORD ; inptr0 + add esi, byte 2*SIZEOF_XMMWORD ; inptr1 + add edi, byte 1*SIZEOF_XMMWORD ; outptr + cmp ecx, byte SIZEOF_XMMWORD + jae near .columnloop + test ecx,ecx + jnz near .columnloop_r8 + + pop esi + pop edi + pop ecx + + add esi, byte 2*SIZEOF_JSAMPROW ; input_data + add edi, byte 1*SIZEOF_JSAMPROW ; output_data + dec eax ; rowctr + jg near .rowloop + +.return: + pop edi + pop esi +; pop edx ; need not be preserved +; pop ecx ; need not be preserved +; pop ebx ; unused + pop ebp + ret + +; For some reason, the OS X linker does not honor the request to align the +; segment unless we do this. + align 16 diff --git a/media/libjpeg/simd/jcsample.h b/media/libjpeg/simd/jcsample.h new file mode 100644 index 000000000..2a50544e9 --- /dev/null +++ b/media/libjpeg/simd/jcsample.h @@ -0,0 +1,28 @@ +/* + * jcsample.h + * + * This file was part of the Independent JPEG Group's software: + * Copyright (C) 1991-1996, Thomas G. Lane. + * For conditions of distribution and use, see the accompanying README.ijg + * file. + */ + +LOCAL(void) +expand_right_edge (JSAMPARRAY image_data, int num_rows, + JDIMENSION input_cols, JDIMENSION output_cols) +{ + register JSAMPROW ptr; + register JSAMPLE pixval; + register int count; + int row; + int numcols = (int) (output_cols - input_cols); + + if (numcols > 0) { + for (row = 0; row < num_rows; row++) { + ptr = image_data[row] + input_cols; + pixval = ptr[-1]; /* don't need GETJSAMPLE() here */ + for (count = numcols; count > 0; count--) + *ptr++ = pixval; + } + } +} diff --git a/media/libjpeg/simd/jdcolext-altivec.c b/media/libjpeg/simd/jdcolext-altivec.c new file mode 100644 index 000000000..fb121ce74 --- /dev/null +++ b/media/libjpeg/simd/jdcolext-altivec.c @@ -0,0 +1,274 @@ +/* + * AltiVec optimizations for libjpeg-turbo + * + * Copyright (C) 2015, D. R. Commander. All Rights Reserved. + * + * This software is provided 'as-is', without any express or implied + * warranty. In no event will the authors be held liable for any damages + * arising from the use of this software. + * + * Permission is granted to anyone to use this software for any purpose, + * including commercial applications, and to alter it and redistribute it + * freely, subject to the following restrictions: + * + * 1. The origin of this software must not be misrepresented; you must not + * claim that you wrote the original software. If you use this software + * in a product, an acknowledgment in the product documentation would be + * appreciated but is not required. + * 2. Altered source versions must be plainly marked as such, and must not be + * misrepresented as being the original software. + * 3. This notice may not be removed or altered from any source distribution. + */ + +/* This file is included by jdcolor-altivec.c */ + + +void jsimd_ycc_rgb_convert_altivec (JDIMENSION out_width, JSAMPIMAGE input_buf, + JDIMENSION input_row, + JSAMPARRAY output_buf, int num_rows) +{ + JSAMPROW outptr, inptr0, inptr1, inptr2; + int pitch = out_width * RGB_PIXELSIZE, num_cols; +#if __BIG_ENDIAN__ + int offset; +#endif + unsigned char __attribute__((aligned(16))) tmpbuf[RGB_PIXELSIZE * 16]; + + __vector unsigned char rgb0, rgb1, rgb2, rgbx0, rgbx1, rgbx2, rgbx3, + y, cb, cr; +#if __BIG_ENDIAN__ + __vector unsigned char edgel, edgeh, edges, out0, out1, out2, out3; +#if RGB_PIXELSIZE == 4 + __vector unsigned char out4; +#endif +#endif +#if RGB_PIXELSIZE == 4 + __vector unsigned char rgb3; +#endif + __vector short rg0, rg1, rg2, rg3, bx0, bx1, bx2, bx3, yl, yh, cbl, cbh, + crl, crh, rl, rh, gl, gh, bl, bh, g0w, g1w, g2w, g3w; + __vector int g0, g1, g2, g3; + + /* Constants + * NOTE: The >> 1 is to compensate for the fact that vec_madds() returns 17 + * high-order bits, not 16. + */ + __vector short pw_f0402 = { __8X(F_0_402 >> 1) }, + pw_mf0228 = { __8X(-F_0_228 >> 1) }, + pw_mf0344_f0285 = { __4X2(-F_0_344, F_0_285) }, + pw_one = { __8X(1) }, pw_255 = { __8X(255) }, + pw_cj = { __8X(CENTERJSAMPLE) }; + __vector int pd_onehalf = { __4X(ONE_HALF) }; + __vector unsigned char pb_zero = { __16X(0) }, +#if __BIG_ENDIAN__ + shift_pack_index = {0,1,4,5,8,9,12,13,16,17,20,21,24,25,28,29}; +#else + shift_pack_index = {2,3,6,7,10,11,14,15,18,19,22,23,26,27,30,31}; +#endif + + while (--num_rows >= 0) { + inptr0 = input_buf[0][input_row]; + inptr1 = input_buf[1][input_row]; + inptr2 = input_buf[2][input_row]; + input_row++; + outptr = *output_buf++; + + for (num_cols = pitch; num_cols > 0; + num_cols -= RGB_PIXELSIZE * 16, outptr += RGB_PIXELSIZE * 16, + inptr0 += 16, inptr1 += 16, inptr2 += 16) { + + y = vec_ld(0, inptr0); + /* NOTE: We have to use vec_merge*() here because vec_unpack*() doesn't + * support unsigned vectors. + */ + yl = (__vector signed short)VEC_UNPACKHU(y); + yh = (__vector signed short)VEC_UNPACKLU(y); + + cb = vec_ld(0, inptr1); + cbl = (__vector signed short)VEC_UNPACKHU(cb); + cbh = (__vector signed short)VEC_UNPACKLU(cb); + cbl = vec_sub(cbl, pw_cj); + cbh = vec_sub(cbh, pw_cj); + + cr = vec_ld(0, inptr2); + crl = (__vector signed short)VEC_UNPACKHU(cr); + crh = (__vector signed short)VEC_UNPACKLU(cr); + crl = vec_sub(crl, pw_cj); + crh = vec_sub(crh, pw_cj); + + /* (Original) + * R = Y + 1.40200 * Cr + * G = Y - 0.34414 * Cb - 0.71414 * Cr + * B = Y + 1.77200 * Cb + * + * (This implementation) + * R = Y + 0.40200 * Cr + Cr + * G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr + * B = Y - 0.22800 * Cb + Cb + Cb + */ + bl = vec_add(cbl, cbl); + bh = vec_add(cbh, cbh); + bl = vec_madds(bl, pw_mf0228, pw_one); + bh = vec_madds(bh, pw_mf0228, pw_one); + bl = vec_sra(bl, (__vector unsigned short)pw_one); + bh = vec_sra(bh, (__vector unsigned short)pw_one); + bl = vec_add(bl, cbl); + bh = vec_add(bh, cbh); + bl = vec_add(bl, cbl); + bh = vec_add(bh, cbh); + bl = vec_add(bl, yl); + bh = vec_add(bh, yh); + + rl = vec_add(crl, crl); + rh = vec_add(crh, crh); + rl = vec_madds(rl, pw_f0402, pw_one); + rh = vec_madds(rh, pw_f0402, pw_one); + rl = vec_sra(rl, (__vector unsigned short)pw_one); + rh = vec_sra(rh, (__vector unsigned short)pw_one); + rl = vec_add(rl, crl); + rh = vec_add(rh, crh); + rl = vec_add(rl, yl); + rh = vec_add(rh, yh); + + g0w = vec_mergeh(cbl, crl); + g1w = vec_mergel(cbl, crl); + g0 = vec_msums(g0w, pw_mf0344_f0285, pd_onehalf); + g1 = vec_msums(g1w, pw_mf0344_f0285, pd_onehalf); + g2w = vec_mergeh(cbh, crh); + g3w = vec_mergel(cbh, crh); + g2 = vec_msums(g2w, pw_mf0344_f0285, pd_onehalf); + g3 = vec_msums(g3w, pw_mf0344_f0285, pd_onehalf); + /* Clever way to avoid 4 shifts + 2 packs. This packs the high word from + * each dword into a new 16-bit vector, which is the equivalent of + * descaling the 32-bit results (right-shifting by 16 bits) and then + * packing them. + */ + gl = vec_perm((__vector short)g0, (__vector short)g1, shift_pack_index); + gh = vec_perm((__vector short)g2, (__vector short)g3, shift_pack_index); + gl = vec_sub(gl, crl); + gh = vec_sub(gh, crh); + gl = vec_add(gl, yl); + gh = vec_add(gh, yh); + + rg0 = vec_mergeh(rl, gl); + bx0 = vec_mergeh(bl, pw_255); + rg1 = vec_mergel(rl, gl); + bx1 = vec_mergel(bl, pw_255); + rg2 = vec_mergeh(rh, gh); + bx2 = vec_mergeh(bh, pw_255); + rg3 = vec_mergel(rh, gh); + bx3 = vec_mergel(bh, pw_255); + + rgbx0 = vec_packsu(rg0, bx0); + rgbx1 = vec_packsu(rg1, bx1); + rgbx2 = vec_packsu(rg2, bx2); + rgbx3 = vec_packsu(rg3, bx3); + +#if RGB_PIXELSIZE == 3 + /* rgbx0 = R0 G0 R1 G1 R2 G2 R3 G3 B0 X0 B1 X1 B2 X2 B3 X3 + * rgbx1 = R4 G4 R5 G5 R6 G6 R7 G7 B4 X4 B5 X5 B6 X6 B7 X7 + * rgbx2 = R8 G8 R9 G9 Ra Ga Rb Gb B8 X8 B9 X9 Ba Xa Bb Xb + * rgbx3 = Rc Gc Rd Gd Re Ge Rf Gf Bc Xc Bd Xd Be Xe Bf Xf + * + * rgb0 = R0 G0 B0 R1 G1 B1 R2 G2 B2 R3 G3 B3 R4 G4 B4 R5 + * rgb1 = G5 B5 R6 G6 B6 R7 G7 B7 R8 G8 B8 R9 G9 B9 Ra Ga + * rgb2 = Ba Rb Gb Bb Rc Gc Bc Rd Gd Bd Re Ge Be Rf Gf Bf + */ + rgb0 = vec_perm(rgbx0, rgbx1, (__vector unsigned char)RGB_INDEX0); + rgb1 = vec_perm(rgbx1, rgbx2, (__vector unsigned char)RGB_INDEX1); + rgb2 = vec_perm(rgbx2, rgbx3, (__vector unsigned char)RGB_INDEX2); +#else + /* rgbx0 = R0 G0 R1 G1 R2 G2 R3 G3 B0 X0 B1 X1 B2 X2 B3 X3 + * rgbx1 = R4 G4 R5 G5 R6 G6 R7 G7 B4 X4 B5 X5 B6 X6 B7 X7 + * rgbx2 = R8 G8 R9 G9 Ra Ga Rb Gb B8 X8 B9 X9 Ba Xa Bb Xb + * rgbx3 = Rc Gc Rd Gd Re Ge Rf Gf Bc Xc Bd Xd Be Xe Bf Xf + * + * rgb0 = R0 G0 B0 X0 R1 G1 B1 X1 R2 G2 B2 X2 R3 G3 B3 X3 + * rgb1 = R4 G4 B4 X4 R5 G5 B5 X5 R6 G6 B6 X6 R7 G7 B7 X7 + * rgb2 = R8 G8 B8 X8 R9 G9 B9 X9 Ra Ga Ba Xa Rb Gb Bb Xb + * rgb3 = Rc Gc Bc Xc Rd Gd Bd Xd Re Ge Be Xe Rf Gf Bf Xf + */ + rgb0 = vec_perm(rgbx0, rgbx0, (__vector unsigned char)RGB_INDEX); + rgb1 = vec_perm(rgbx1, rgbx1, (__vector unsigned char)RGB_INDEX); + rgb2 = vec_perm(rgbx2, rgbx2, (__vector unsigned char)RGB_INDEX); + rgb3 = vec_perm(rgbx3, rgbx3, (__vector unsigned char)RGB_INDEX); +#endif + +#if __BIG_ENDIAN__ + offset = (size_t)outptr & 15; + if (offset) { + __vector unsigned char unaligned_shift_index; + int bytes = num_cols + offset; + + if (bytes < (RGB_PIXELSIZE + 1) * 16 && (bytes & 15)) { + /* Slow path to prevent buffer overwrite. Since there is no way to + * write a partial AltiVec register, overwrite would occur on the + * last chunk of the last image row if the right edge is not on a + * 16-byte boundary. It could also occur on other rows if the bytes + * per row is low enough. Since we can't determine whether we're on + * the last image row, we have to assume every row is the last. + */ + vec_st(rgb0, 0, tmpbuf); + vec_st(rgb1, 16, tmpbuf); + vec_st(rgb2, 32, tmpbuf); +#if RGB_PIXELSIZE == 4 + vec_st(rgb3, 48, tmpbuf); +#endif + memcpy(outptr, tmpbuf, min(num_cols, RGB_PIXELSIZE * 16)); + } else { + /* Fast path */ + unaligned_shift_index = vec_lvsl(0, outptr); + edgel = vec_ld(0, outptr); + edgeh = vec_ld(min(num_cols - 1, RGB_PIXELSIZE * 16), outptr); + edges = vec_perm(edgeh, edgel, unaligned_shift_index); + unaligned_shift_index = vec_lvsr(0, outptr); + out0 = vec_perm(edges, rgb0, unaligned_shift_index); + out1 = vec_perm(rgb0, rgb1, unaligned_shift_index); + out2 = vec_perm(rgb1, rgb2, unaligned_shift_index); +#if RGB_PIXELSIZE == 4 + out3 = vec_perm(rgb2, rgb3, unaligned_shift_index); + out4 = vec_perm(rgb3, edges, unaligned_shift_index); +#else + out3 = vec_perm(rgb2, edges, unaligned_shift_index); +#endif + vec_st(out0, 0, outptr); + if (bytes > 16) + vec_st(out1, 16, outptr); + if (bytes > 32) + vec_st(out2, 32, outptr); + if (bytes > 48) + vec_st(out3, 48, outptr); +#if RGB_PIXELSIZE == 4 + if (bytes > 64) + vec_st(out4, 64, outptr); +#endif + } + } else { +#endif /* __BIG_ENDIAN__ */ + if (num_cols < RGB_PIXELSIZE * 16 && (num_cols & 15)) { + /* Slow path */ + VEC_ST(rgb0, 0, tmpbuf); + VEC_ST(rgb1, 16, tmpbuf); + VEC_ST(rgb2, 32, tmpbuf); +#if RGB_PIXELSIZE == 4 + VEC_ST(rgb3, 48, tmpbuf); +#endif + memcpy(outptr, tmpbuf, min(num_cols, RGB_PIXELSIZE * 16)); + } else { + /* Fast path */ + VEC_ST(rgb0, 0, outptr); + if (num_cols > 16) + VEC_ST(rgb1, 16, outptr); + if (num_cols > 32) + VEC_ST(rgb2, 32, outptr); +#if RGB_PIXELSIZE == 4 + if (num_cols > 48) + VEC_ST(rgb3, 48, outptr); +#endif + } +#if __BIG_ENDIAN__ + } +#endif + } + } +} diff --git a/media/libjpeg/simd/jdcolext-mmx.asm b/media/libjpeg/simd/jdcolext-mmx.asm new file mode 100644 index 000000000..21e34f678 --- /dev/null +++ b/media/libjpeg/simd/jdcolext-mmx.asm @@ -0,0 +1,404 @@ +; +; jdcolext.asm - colorspace conversion (MMX) +; +; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB +; +; Based on the x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler). +; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 +; +; [TAB8] + +%include "jcolsamp.inc" + +; -------------------------------------------------------------------------- +; +; Convert some rows of samples to the output colorspace. +; +; GLOBAL(void) +; jsimd_ycc_rgb_convert_mmx (JDIMENSION out_width, +; JSAMPIMAGE input_buf, JDIMENSION input_row, +; JSAMPARRAY output_buf, int num_rows) +; + +%define out_width(b) (b)+8 ; JDIMENSION out_width +%define input_buf(b) (b)+12 ; JSAMPIMAGE input_buf +%define input_row(b) (b)+16 ; JDIMENSION input_row +%define output_buf(b) (b)+20 ; JSAMPARRAY output_buf +%define num_rows(b) (b)+24 ; int num_rows + +%define original_ebp ebp+0 +%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_MMWORD ; mmword wk[WK_NUM] +%define WK_NUM 2 +%define gotptr wk(0)-SIZEOF_POINTER ; void * gotptr + + align 16 + global EXTN(jsimd_ycc_rgb_convert_mmx) + +EXTN(jsimd_ycc_rgb_convert_mmx): + push ebp + mov eax,esp ; eax = original ebp + sub esp, byte 4 + and esp, byte (-SIZEOF_MMWORD) ; align to 64 bits + mov [esp],eax + mov ebp,esp ; ebp = aligned ebp + lea esp, [wk(0)] + pushpic eax ; make a room for GOT address + push ebx +; push ecx ; need not be preserved +; push edx ; need not be preserved + push esi + push edi + + get_GOT ebx ; get GOT address + movpic POINTER [gotptr], ebx ; save GOT address + + mov ecx, JDIMENSION [out_width(eax)] ; num_cols + test ecx,ecx + jz near .return + + push ecx + + mov edi, JSAMPIMAGE [input_buf(eax)] + mov ecx, JDIMENSION [input_row(eax)] + mov esi, JSAMPARRAY [edi+0*SIZEOF_JSAMPARRAY] + mov ebx, JSAMPARRAY [edi+1*SIZEOF_JSAMPARRAY] + mov edx, JSAMPARRAY [edi+2*SIZEOF_JSAMPARRAY] + lea esi, [esi+ecx*SIZEOF_JSAMPROW] + lea ebx, [ebx+ecx*SIZEOF_JSAMPROW] + lea edx, [edx+ecx*SIZEOF_JSAMPROW] + + pop ecx + + mov edi, JSAMPARRAY [output_buf(eax)] + mov eax, INT [num_rows(eax)] + test eax,eax + jle near .return + alignx 16,7 +.rowloop: + push eax + push edi + push edx + push ebx + push esi + push ecx ; col + + mov esi, JSAMPROW [esi] ; inptr0 + mov ebx, JSAMPROW [ebx] ; inptr1 + mov edx, JSAMPROW [edx] ; inptr2 + mov edi, JSAMPROW [edi] ; outptr + movpic eax, POINTER [gotptr] ; load GOT address (eax) + alignx 16,7 +.columnloop: + + movq mm5, MMWORD [ebx] ; mm5=Cb(01234567) + movq mm1, MMWORD [edx] ; mm1=Cr(01234567) + + pcmpeqw mm4,mm4 + pcmpeqw mm7,mm7 + psrlw mm4,BYTE_BIT + psllw mm7,7 ; mm7={0xFF80 0xFF80 0xFF80 0xFF80} + movq mm0,mm4 ; mm0=mm4={0xFF 0x00 0xFF 0x00 ..} + + pand mm4,mm5 ; mm4=Cb(0246)=CbE + psrlw mm5,BYTE_BIT ; mm5=Cb(1357)=CbO + pand mm0,mm1 ; mm0=Cr(0246)=CrE + psrlw mm1,BYTE_BIT ; mm1=Cr(1357)=CrO + + paddw mm4,mm7 + paddw mm5,mm7 + paddw mm0,mm7 + paddw mm1,mm7 + + ; (Original) + ; R = Y + 1.40200 * Cr + ; G = Y - 0.34414 * Cb - 0.71414 * Cr + ; B = Y + 1.77200 * Cb + ; + ; (This implementation) + ; R = Y + 0.40200 * Cr + Cr + ; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr + ; B = Y - 0.22800 * Cb + Cb + Cb + + movq mm2,mm4 ; mm2=CbE + movq mm3,mm5 ; mm3=CbO + paddw mm4,mm4 ; mm4=2*CbE + paddw mm5,mm5 ; mm5=2*CbO + movq mm6,mm0 ; mm6=CrE + movq mm7,mm1 ; mm7=CrO + paddw mm0,mm0 ; mm0=2*CrE + paddw mm1,mm1 ; mm1=2*CrO + + pmulhw mm4,[GOTOFF(eax,PW_MF0228)] ; mm4=(2*CbE * -FIX(0.22800)) + pmulhw mm5,[GOTOFF(eax,PW_MF0228)] ; mm5=(2*CbO * -FIX(0.22800)) + pmulhw mm0,[GOTOFF(eax,PW_F0402)] ; mm0=(2*CrE * FIX(0.40200)) + pmulhw mm1,[GOTOFF(eax,PW_F0402)] ; mm1=(2*CrO * FIX(0.40200)) + + paddw mm4,[GOTOFF(eax,PW_ONE)] + paddw mm5,[GOTOFF(eax,PW_ONE)] + psraw mm4,1 ; mm4=(CbE * -FIX(0.22800)) + psraw mm5,1 ; mm5=(CbO * -FIX(0.22800)) + paddw mm0,[GOTOFF(eax,PW_ONE)] + paddw mm1,[GOTOFF(eax,PW_ONE)] + psraw mm0,1 ; mm0=(CrE * FIX(0.40200)) + psraw mm1,1 ; mm1=(CrO * FIX(0.40200)) + + paddw mm4,mm2 + paddw mm5,mm3 + paddw mm4,mm2 ; mm4=(CbE * FIX(1.77200))=(B-Y)E + paddw mm5,mm3 ; mm5=(CbO * FIX(1.77200))=(B-Y)O + paddw mm0,mm6 ; mm0=(CrE * FIX(1.40200))=(R-Y)E + paddw mm1,mm7 ; mm1=(CrO * FIX(1.40200))=(R-Y)O + + movq MMWORD [wk(0)], mm4 ; wk(0)=(B-Y)E + movq MMWORD [wk(1)], mm5 ; wk(1)=(B-Y)O + + movq mm4,mm2 + movq mm5,mm3 + punpcklwd mm2,mm6 + punpckhwd mm4,mm6 + pmaddwd mm2,[GOTOFF(eax,PW_MF0344_F0285)] + pmaddwd mm4,[GOTOFF(eax,PW_MF0344_F0285)] + punpcklwd mm3,mm7 + punpckhwd mm5,mm7 + pmaddwd mm3,[GOTOFF(eax,PW_MF0344_F0285)] + pmaddwd mm5,[GOTOFF(eax,PW_MF0344_F0285)] + + paddd mm2,[GOTOFF(eax,PD_ONEHALF)] + paddd mm4,[GOTOFF(eax,PD_ONEHALF)] + psrad mm2,SCALEBITS + psrad mm4,SCALEBITS + paddd mm3,[GOTOFF(eax,PD_ONEHALF)] + paddd mm5,[GOTOFF(eax,PD_ONEHALF)] + psrad mm3,SCALEBITS + psrad mm5,SCALEBITS + + packssdw mm2,mm4 ; mm2=CbE*-FIX(0.344)+CrE*FIX(0.285) + packssdw mm3,mm5 ; mm3=CbO*-FIX(0.344)+CrO*FIX(0.285) + psubw mm2,mm6 ; mm2=CbE*-FIX(0.344)+CrE*-FIX(0.714)=(G-Y)E + psubw mm3,mm7 ; mm3=CbO*-FIX(0.344)+CrO*-FIX(0.714)=(G-Y)O + + movq mm5, MMWORD [esi] ; mm5=Y(01234567) + + pcmpeqw mm4,mm4 + psrlw mm4,BYTE_BIT ; mm4={0xFF 0x00 0xFF 0x00 ..} + pand mm4,mm5 ; mm4=Y(0246)=YE + psrlw mm5,BYTE_BIT ; mm5=Y(1357)=YO + + paddw mm0,mm4 ; mm0=((R-Y)E+YE)=RE=(R0 R2 R4 R6) + paddw mm1,mm5 ; mm1=((R-Y)O+YO)=RO=(R1 R3 R5 R7) + packuswb mm0,mm0 ; mm0=(R0 R2 R4 R6 ** ** ** **) + packuswb mm1,mm1 ; mm1=(R1 R3 R5 R7 ** ** ** **) + + paddw mm2,mm4 ; mm2=((G-Y)E+YE)=GE=(G0 G2 G4 G6) + paddw mm3,mm5 ; mm3=((G-Y)O+YO)=GO=(G1 G3 G5 G7) + packuswb mm2,mm2 ; mm2=(G0 G2 G4 G6 ** ** ** **) + packuswb mm3,mm3 ; mm3=(G1 G3 G5 G7 ** ** ** **) + + paddw mm4, MMWORD [wk(0)] ; mm4=(YE+(B-Y)E)=BE=(B0 B2 B4 B6) + paddw mm5, MMWORD [wk(1)] ; mm5=(YO+(B-Y)O)=BO=(B1 B3 B5 B7) + packuswb mm4,mm4 ; mm4=(B0 B2 B4 B6 ** ** ** **) + packuswb mm5,mm5 ; mm5=(B1 B3 B5 B7 ** ** ** **) + +%if RGB_PIXELSIZE == 3 ; --------------- + + ; mmA=(00 02 04 06 ** ** ** **), mmB=(01 03 05 07 ** ** ** **) + ; mmC=(10 12 14 16 ** ** ** **), mmD=(11 13 15 17 ** ** ** **) + ; mmE=(20 22 24 26 ** ** ** **), mmF=(21 23 25 27 ** ** ** **) + ; mmG=(** ** ** ** ** ** ** **), mmH=(** ** ** ** ** ** ** **) + + punpcklbw mmA,mmC ; mmA=(00 10 02 12 04 14 06 16) + punpcklbw mmE,mmB ; mmE=(20 01 22 03 24 05 26 07) + punpcklbw mmD,mmF ; mmD=(11 21 13 23 15 25 17 27) + + movq mmG,mmA + movq mmH,mmA + punpcklwd mmA,mmE ; mmA=(00 10 20 01 02 12 22 03) + punpckhwd mmG,mmE ; mmG=(04 14 24 05 06 16 26 07) + + psrlq mmH,2*BYTE_BIT ; mmH=(02 12 04 14 06 16 -- --) + psrlq mmE,2*BYTE_BIT ; mmE=(22 03 24 05 26 07 -- --) + + movq mmC,mmD + movq mmB,mmD + punpcklwd mmD,mmH ; mmD=(11 21 02 12 13 23 04 14) + punpckhwd mmC,mmH ; mmC=(15 25 06 16 17 27 -- --) + + psrlq mmB,2*BYTE_BIT ; mmB=(13 23 15 25 17 27 -- --) + + movq mmF,mmE + punpcklwd mmE,mmB ; mmE=(22 03 13 23 24 05 15 25) + punpckhwd mmF,mmB ; mmF=(26 07 17 27 -- -- -- --) + + punpckldq mmA,mmD ; mmA=(00 10 20 01 11 21 02 12) + punpckldq mmE,mmG ; mmE=(22 03 13 23 04 14 24 05) + punpckldq mmC,mmF ; mmC=(15 25 06 16 26 07 17 27) + + cmp ecx, byte SIZEOF_MMWORD + jb short .column_st16 + + movq MMWORD [edi+0*SIZEOF_MMWORD], mmA + movq MMWORD [edi+1*SIZEOF_MMWORD], mmE + movq MMWORD [edi+2*SIZEOF_MMWORD], mmC + + sub ecx, byte SIZEOF_MMWORD + jz short .nextrow + + add esi, byte SIZEOF_MMWORD ; inptr0 + add ebx, byte SIZEOF_MMWORD ; inptr1 + add edx, byte SIZEOF_MMWORD ; inptr2 + add edi, byte RGB_PIXELSIZE*SIZEOF_MMWORD ; outptr + jmp near .columnloop + alignx 16,7 + +.column_st16: + lea ecx, [ecx+ecx*2] ; imul ecx, RGB_PIXELSIZE + cmp ecx, byte 2*SIZEOF_MMWORD + jb short .column_st8 + movq MMWORD [edi+0*SIZEOF_MMWORD], mmA + movq MMWORD [edi+1*SIZEOF_MMWORD], mmE + movq mmA,mmC + sub ecx, byte 2*SIZEOF_MMWORD + add edi, byte 2*SIZEOF_MMWORD + jmp short .column_st4 +.column_st8: + cmp ecx, byte SIZEOF_MMWORD + jb short .column_st4 + movq MMWORD [edi+0*SIZEOF_MMWORD], mmA + movq mmA,mmE + sub ecx, byte SIZEOF_MMWORD + add edi, byte SIZEOF_MMWORD +.column_st4: + movd eax,mmA + cmp ecx, byte SIZEOF_DWORD + jb short .column_st2 + mov DWORD [edi+0*SIZEOF_DWORD], eax + psrlq mmA,DWORD_BIT + movd eax,mmA + sub ecx, byte SIZEOF_DWORD + add edi, byte SIZEOF_DWORD +.column_st2: + cmp ecx, byte SIZEOF_WORD + jb short .column_st1 + mov WORD [edi+0*SIZEOF_WORD], ax + shr eax,WORD_BIT + sub ecx, byte SIZEOF_WORD + add edi, byte SIZEOF_WORD +.column_st1: + cmp ecx, byte SIZEOF_BYTE + jb short .nextrow + mov BYTE [edi+0*SIZEOF_BYTE], al + +%else ; RGB_PIXELSIZE == 4 ; ----------- + +%ifdef RGBX_FILLER_0XFF + pcmpeqb mm6,mm6 ; mm6=(X0 X2 X4 X6 ** ** ** **) + pcmpeqb mm7,mm7 ; mm7=(X1 X3 X5 X7 ** ** ** **) +%else + pxor mm6,mm6 ; mm6=(X0 X2 X4 X6 ** ** ** **) + pxor mm7,mm7 ; mm7=(X1 X3 X5 X7 ** ** ** **) +%endif + ; mmA=(00 02 04 06 ** ** ** **), mmB=(01 03 05 07 ** ** ** **) + ; mmC=(10 12 14 16 ** ** ** **), mmD=(11 13 15 17 ** ** ** **) + ; mmE=(20 22 24 26 ** ** ** **), mmF=(21 23 25 27 ** ** ** **) + ; mmG=(30 32 34 36 ** ** ** **), mmH=(31 33 35 37 ** ** ** **) + + punpcklbw mmA,mmC ; mmA=(00 10 02 12 04 14 06 16) + punpcklbw mmE,mmG ; mmE=(20 30 22 32 24 34 26 36) + punpcklbw mmB,mmD ; mmB=(01 11 03 13 05 15 07 17) + punpcklbw mmF,mmH ; mmF=(21 31 23 33 25 35 27 37) + + movq mmC,mmA + punpcklwd mmA,mmE ; mmA=(00 10 20 30 02 12 22 32) + punpckhwd mmC,mmE ; mmC=(04 14 24 34 06 16 26 36) + movq mmG,mmB + punpcklwd mmB,mmF ; mmB=(01 11 21 31 03 13 23 33) + punpckhwd mmG,mmF ; mmG=(05 15 25 35 07 17 27 37) + + movq mmD,mmA + punpckldq mmA,mmB ; mmA=(00 10 20 30 01 11 21 31) + punpckhdq mmD,mmB ; mmD=(02 12 22 32 03 13 23 33) + movq mmH,mmC + punpckldq mmC,mmG ; mmC=(04 14 24 34 05 15 25 35) + punpckhdq mmH,mmG ; mmH=(06 16 26 36 07 17 27 37) + + cmp ecx, byte SIZEOF_MMWORD + jb short .column_st16 + + movq MMWORD [edi+0*SIZEOF_MMWORD], mmA + movq MMWORD [edi+1*SIZEOF_MMWORD], mmD + movq MMWORD [edi+2*SIZEOF_MMWORD], mmC + movq MMWORD [edi+3*SIZEOF_MMWORD], mmH + + sub ecx, byte SIZEOF_MMWORD + jz short .nextrow + + add esi, byte SIZEOF_MMWORD ; inptr0 + add ebx, byte SIZEOF_MMWORD ; inptr1 + add edx, byte SIZEOF_MMWORD ; inptr2 + add edi, byte RGB_PIXELSIZE*SIZEOF_MMWORD ; outptr + jmp near .columnloop + alignx 16,7 + +.column_st16: + cmp ecx, byte SIZEOF_MMWORD/2 + jb short .column_st8 + movq MMWORD [edi+0*SIZEOF_MMWORD], mmA + movq MMWORD [edi+1*SIZEOF_MMWORD], mmD + movq mmA,mmC + movq mmD,mmH + sub ecx, byte SIZEOF_MMWORD/2 + add edi, byte 2*SIZEOF_MMWORD +.column_st8: + cmp ecx, byte SIZEOF_MMWORD/4 + jb short .column_st4 + movq MMWORD [edi+0*SIZEOF_MMWORD], mmA + movq mmA,mmD + sub ecx, byte SIZEOF_MMWORD/4 + add edi, byte 1*SIZEOF_MMWORD +.column_st4: + cmp ecx, byte SIZEOF_MMWORD/8 + jb short .nextrow + movd DWORD [edi+0*SIZEOF_DWORD], mmA + +%endif ; RGB_PIXELSIZE ; --------------- + + alignx 16,7 + +.nextrow: + pop ecx + pop esi + pop ebx + pop edx + pop edi + pop eax + + add esi, byte SIZEOF_JSAMPROW + add ebx, byte SIZEOF_JSAMPROW + add edx, byte SIZEOF_JSAMPROW + add edi, byte SIZEOF_JSAMPROW ; output_buf + dec eax ; num_rows + jg near .rowloop + + emms ; empty MMX state + +.return: + pop edi + pop esi +; pop edx ; need not be preserved +; pop ecx ; need not be preserved + pop ebx + mov esp,ebp ; esp <- aligned ebp + pop esp ; esp <- original ebp + pop ebp + ret + +; For some reason, the OS X linker does not honor the request to align the +; segment unless we do this. + align 16 diff --git a/media/libjpeg/simd/jdcolext-sse2-64.asm b/media/libjpeg/simd/jdcolext-sse2-64.asm new file mode 100644 index 000000000..4634066c4 --- /dev/null +++ b/media/libjpeg/simd/jdcolext-sse2-64.asm @@ -0,0 +1,440 @@ +; +; jdcolext.asm - colorspace conversion (64-bit SSE2) +; +; Copyright 2009, 2012 Pierre Ossman <ossman@cendio.se> for Cendio AB +; Copyright (C) 2009, 2012, D. R. Commander. +; +; Based on the x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler). +; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 +; +; [TAB8] + +%include "jcolsamp.inc" + +; -------------------------------------------------------------------------- +; +; Convert some rows of samples to the output colorspace. +; +; GLOBAL(void) +; jsimd_ycc_rgb_convert_sse2 (JDIMENSION out_width, +; JSAMPIMAGE input_buf, JDIMENSION input_row, +; JSAMPARRAY output_buf, int num_rows) +; + +; r10 = JDIMENSION out_width +; r11 = JSAMPIMAGE input_buf +; r12 = JDIMENSION input_row +; r13 = JSAMPARRAY output_buf +; r14 = int num_rows + +%define wk(i) rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM] +%define WK_NUM 2 + + align 16 + global EXTN(jsimd_ycc_rgb_convert_sse2) + +EXTN(jsimd_ycc_rgb_convert_sse2): + push rbp + mov rax,rsp ; rax = original rbp + sub rsp, byte 4 + and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits + mov [rsp],rax + mov rbp,rsp ; rbp = aligned rbp + lea rsp, [wk(0)] + collect_args + push rbx + + mov ecx, r10d ; num_cols + test rcx,rcx + jz near .return + + push rcx + + mov rdi, r11 + mov ecx, r12d + mov rsi, JSAMPARRAY [rdi+0*SIZEOF_JSAMPARRAY] + mov rbx, JSAMPARRAY [rdi+1*SIZEOF_JSAMPARRAY] + mov rdx, JSAMPARRAY [rdi+2*SIZEOF_JSAMPARRAY] + lea rsi, [rsi+rcx*SIZEOF_JSAMPROW] + lea rbx, [rbx+rcx*SIZEOF_JSAMPROW] + lea rdx, [rdx+rcx*SIZEOF_JSAMPROW] + + pop rcx + + mov rdi, r13 + mov eax, r14d + test rax,rax + jle near .return +.rowloop: + push rax + push rdi + push rdx + push rbx + push rsi + push rcx ; col + + mov rsi, JSAMPROW [rsi] ; inptr0 + mov rbx, JSAMPROW [rbx] ; inptr1 + mov rdx, JSAMPROW [rdx] ; inptr2 + mov rdi, JSAMPROW [rdi] ; outptr +.columnloop: + + movdqa xmm5, XMMWORD [rbx] ; xmm5=Cb(0123456789ABCDEF) + movdqa xmm1, XMMWORD [rdx] ; xmm1=Cr(0123456789ABCDEF) + + pcmpeqw xmm4,xmm4 + pcmpeqw xmm7,xmm7 + psrlw xmm4,BYTE_BIT + psllw xmm7,7 ; xmm7={0xFF80 0xFF80 0xFF80 0xFF80 ..} + movdqa xmm0,xmm4 ; xmm0=xmm4={0xFF 0x00 0xFF 0x00 ..} + + pand xmm4,xmm5 ; xmm4=Cb(02468ACE)=CbE + psrlw xmm5,BYTE_BIT ; xmm5=Cb(13579BDF)=CbO + pand xmm0,xmm1 ; xmm0=Cr(02468ACE)=CrE + psrlw xmm1,BYTE_BIT ; xmm1=Cr(13579BDF)=CrO + + paddw xmm4,xmm7 + paddw xmm5,xmm7 + paddw xmm0,xmm7 + paddw xmm1,xmm7 + + ; (Original) + ; R = Y + 1.40200 * Cr + ; G = Y - 0.34414 * Cb - 0.71414 * Cr + ; B = Y + 1.77200 * Cb + ; + ; (This implementation) + ; R = Y + 0.40200 * Cr + Cr + ; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr + ; B = Y - 0.22800 * Cb + Cb + Cb + + movdqa xmm2,xmm4 ; xmm2=CbE + movdqa xmm3,xmm5 ; xmm3=CbO + paddw xmm4,xmm4 ; xmm4=2*CbE + paddw xmm5,xmm5 ; xmm5=2*CbO + movdqa xmm6,xmm0 ; xmm6=CrE + movdqa xmm7,xmm1 ; xmm7=CrO + paddw xmm0,xmm0 ; xmm0=2*CrE + paddw xmm1,xmm1 ; xmm1=2*CrO + + pmulhw xmm4,[rel PW_MF0228] ; xmm4=(2*CbE * -FIX(0.22800)) + pmulhw xmm5,[rel PW_MF0228] ; xmm5=(2*CbO * -FIX(0.22800)) + pmulhw xmm0,[rel PW_F0402] ; xmm0=(2*CrE * FIX(0.40200)) + pmulhw xmm1,[rel PW_F0402] ; xmm1=(2*CrO * FIX(0.40200)) + + paddw xmm4,[rel PW_ONE] + paddw xmm5,[rel PW_ONE] + psraw xmm4,1 ; xmm4=(CbE * -FIX(0.22800)) + psraw xmm5,1 ; xmm5=(CbO * -FIX(0.22800)) + paddw xmm0,[rel PW_ONE] + paddw xmm1,[rel PW_ONE] + psraw xmm0,1 ; xmm0=(CrE * FIX(0.40200)) + psraw xmm1,1 ; xmm1=(CrO * FIX(0.40200)) + + paddw xmm4,xmm2 + paddw xmm5,xmm3 + paddw xmm4,xmm2 ; xmm4=(CbE * FIX(1.77200))=(B-Y)E + paddw xmm5,xmm3 ; xmm5=(CbO * FIX(1.77200))=(B-Y)O + paddw xmm0,xmm6 ; xmm0=(CrE * FIX(1.40200))=(R-Y)E + paddw xmm1,xmm7 ; xmm1=(CrO * FIX(1.40200))=(R-Y)O + + movdqa XMMWORD [wk(0)], xmm4 ; wk(0)=(B-Y)E + movdqa XMMWORD [wk(1)], xmm5 ; wk(1)=(B-Y)O + + movdqa xmm4,xmm2 + movdqa xmm5,xmm3 + punpcklwd xmm2,xmm6 + punpckhwd xmm4,xmm6 + pmaddwd xmm2,[rel PW_MF0344_F0285] + pmaddwd xmm4,[rel PW_MF0344_F0285] + punpcklwd xmm3,xmm7 + punpckhwd xmm5,xmm7 + pmaddwd xmm3,[rel PW_MF0344_F0285] + pmaddwd xmm5,[rel PW_MF0344_F0285] + + paddd xmm2,[rel PD_ONEHALF] + paddd xmm4,[rel PD_ONEHALF] + psrad xmm2,SCALEBITS + psrad xmm4,SCALEBITS + paddd xmm3,[rel PD_ONEHALF] + paddd xmm5,[rel PD_ONEHALF] + psrad xmm3,SCALEBITS + psrad xmm5,SCALEBITS + + packssdw xmm2,xmm4 ; xmm2=CbE*-FIX(0.344)+CrE*FIX(0.285) + packssdw xmm3,xmm5 ; xmm3=CbO*-FIX(0.344)+CrO*FIX(0.285) + psubw xmm2,xmm6 ; xmm2=CbE*-FIX(0.344)+CrE*-FIX(0.714)=(G-Y)E + psubw xmm3,xmm7 ; xmm3=CbO*-FIX(0.344)+CrO*-FIX(0.714)=(G-Y)O + + movdqa xmm5, XMMWORD [rsi] ; xmm5=Y(0123456789ABCDEF) + + pcmpeqw xmm4,xmm4 + psrlw xmm4,BYTE_BIT ; xmm4={0xFF 0x00 0xFF 0x00 ..} + pand xmm4,xmm5 ; xmm4=Y(02468ACE)=YE + psrlw xmm5,BYTE_BIT ; xmm5=Y(13579BDF)=YO + + paddw xmm0,xmm4 ; xmm0=((R-Y)E+YE)=RE=R(02468ACE) + paddw xmm1,xmm5 ; xmm1=((R-Y)O+YO)=RO=R(13579BDF) + packuswb xmm0,xmm0 ; xmm0=R(02468ACE********) + packuswb xmm1,xmm1 ; xmm1=R(13579BDF********) + + paddw xmm2,xmm4 ; xmm2=((G-Y)E+YE)=GE=G(02468ACE) + paddw xmm3,xmm5 ; xmm3=((G-Y)O+YO)=GO=G(13579BDF) + packuswb xmm2,xmm2 ; xmm2=G(02468ACE********) + packuswb xmm3,xmm3 ; xmm3=G(13579BDF********) + + paddw xmm4, XMMWORD [wk(0)] ; xmm4=(YE+(B-Y)E)=BE=B(02468ACE) + paddw xmm5, XMMWORD [wk(1)] ; xmm5=(YO+(B-Y)O)=BO=B(13579BDF) + packuswb xmm4,xmm4 ; xmm4=B(02468ACE********) + packuswb xmm5,xmm5 ; xmm5=B(13579BDF********) + +%if RGB_PIXELSIZE == 3 ; --------------- + + ; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **) + ; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **) + ; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **) + ; xmmG=(** ** ** ** ** ** ** ** **), xmmH=(** ** ** ** ** ** ** ** **) + + punpcklbw xmmA,xmmC ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E) + punpcklbw xmmE,xmmB ; xmmE=(20 01 22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F) + punpcklbw xmmD,xmmF ; xmmD=(11 21 13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F) + + movdqa xmmG,xmmA + movdqa xmmH,xmmA + punpcklwd xmmA,xmmE ; xmmA=(00 10 20 01 02 12 22 03 04 14 24 05 06 16 26 07) + punpckhwd xmmG,xmmE ; xmmG=(08 18 28 09 0A 1A 2A 0B 0C 1C 2C 0D 0E 1E 2E 0F) + + psrldq xmmH,2 ; xmmH=(02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E -- --) + psrldq xmmE,2 ; xmmE=(22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F -- --) + + movdqa xmmC,xmmD + movdqa xmmB,xmmD + punpcklwd xmmD,xmmH ; xmmD=(11 21 02 12 13 23 04 14 15 25 06 16 17 27 08 18) + punpckhwd xmmC,xmmH ; xmmC=(19 29 0A 1A 1B 2B 0C 1C 1D 2D 0E 1E 1F 2F -- --) + + psrldq xmmB,2 ; xmmB=(13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F -- --) + + movdqa xmmF,xmmE + punpcklwd xmmE,xmmB ; xmmE=(22 03 13 23 24 05 15 25 26 07 17 27 28 09 19 29) + punpckhwd xmmF,xmmB ; xmmF=(2A 0B 1B 2B 2C 0D 1D 2D 2E 0F 1F 2F -- -- -- --) + + pshufd xmmH,xmmA,0x4E; xmmH=(04 14 24 05 06 16 26 07 00 10 20 01 02 12 22 03) + movdqa xmmB,xmmE + punpckldq xmmA,xmmD ; xmmA=(00 10 20 01 11 21 02 12 02 12 22 03 13 23 04 14) + punpckldq xmmE,xmmH ; xmmE=(22 03 13 23 04 14 24 05 24 05 15 25 06 16 26 07) + punpckhdq xmmD,xmmB ; xmmD=(15 25 06 16 26 07 17 27 17 27 08 18 28 09 19 29) + + pshufd xmmH,xmmG,0x4E; xmmH=(0C 1C 2C 0D 0E 1E 2E 0F 08 18 28 09 0A 1A 2A 0B) + movdqa xmmB,xmmF + punpckldq xmmG,xmmC ; xmmG=(08 18 28 09 19 29 0A 1A 0A 1A 2A 0B 1B 2B 0C 1C) + punpckldq xmmF,xmmH ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 2C 0D 1D 2D 0E 1E 2E 0F) + punpckhdq xmmC,xmmB ; xmmC=(1D 2D 0E 1E 2E 0F 1F 2F 1F 2F -- -- -- -- -- --) + + punpcklqdq xmmA,xmmE ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05) + punpcklqdq xmmD,xmmG ; xmmD=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A) + punpcklqdq xmmF,xmmC ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F) + + cmp rcx, byte SIZEOF_XMMWORD + jb short .column_st32 + + test rdi, SIZEOF_XMMWORD-1 + jnz short .out1 + ; --(aligned)------------------- + movntdq XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA + movntdq XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD + movntdq XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmF + jmp short .out0 +.out1: ; --(unaligned)----------------- + movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA + movdqu XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD + movdqu XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmF +.out0: + add rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr + sub rcx, byte SIZEOF_XMMWORD + jz near .nextrow + + add rsi, byte SIZEOF_XMMWORD ; inptr0 + add rbx, byte SIZEOF_XMMWORD ; inptr1 + add rdx, byte SIZEOF_XMMWORD ; inptr2 + jmp near .columnloop + +.column_st32: + lea rcx, [rcx+rcx*2] ; imul ecx, RGB_PIXELSIZE + cmp rcx, byte 2*SIZEOF_XMMWORD + jb short .column_st16 + movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA + movdqu XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD + add rdi, byte 2*SIZEOF_XMMWORD ; outptr + movdqa xmmA,xmmF + sub rcx, byte 2*SIZEOF_XMMWORD + jmp short .column_st15 +.column_st16: + cmp rcx, byte SIZEOF_XMMWORD + jb short .column_st15 + movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA + add rdi, byte SIZEOF_XMMWORD ; outptr + movdqa xmmA,xmmD + sub rcx, byte SIZEOF_XMMWORD +.column_st15: + ; Store the lower 8 bytes of xmmA to the output when it has enough + ; space. + cmp rcx, byte SIZEOF_MMWORD + jb short .column_st7 + movq XMM_MMWORD [rdi], xmmA + add rdi, byte SIZEOF_MMWORD + sub rcx, byte SIZEOF_MMWORD + psrldq xmmA, SIZEOF_MMWORD +.column_st7: + ; Store the lower 4 bytes of xmmA to the output when it has enough + ; space. + cmp rcx, byte SIZEOF_DWORD + jb short .column_st3 + movd XMM_DWORD [rdi], xmmA + add rdi, byte SIZEOF_DWORD + sub rcx, byte SIZEOF_DWORD + psrldq xmmA, SIZEOF_DWORD +.column_st3: + ; Store the lower 2 bytes of rax to the output when it has enough + ; space. + movd eax, xmmA + cmp rcx, byte SIZEOF_WORD + jb short .column_st1 + mov WORD [rdi], ax + add rdi, byte SIZEOF_WORD + sub rcx, byte SIZEOF_WORD + shr rax, 16 +.column_st1: + ; Store the lower 1 byte of rax to the output when it has enough + ; space. + test rcx, rcx + jz short .nextrow + mov BYTE [rdi], al + +%else ; RGB_PIXELSIZE == 4 ; ----------- + +%ifdef RGBX_FILLER_0XFF + pcmpeqb xmm6,xmm6 ; xmm6=XE=X(02468ACE********) + pcmpeqb xmm7,xmm7 ; xmm7=XO=X(13579BDF********) +%else + pxor xmm6,xmm6 ; xmm6=XE=X(02468ACE********) + pxor xmm7,xmm7 ; xmm7=XO=X(13579BDF********) +%endif + ; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **) + ; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **) + ; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **) + ; xmmG=(30 32 34 36 38 3A 3C 3E **), xmmH=(31 33 35 37 39 3B 3D 3F **) + + punpcklbw xmmA,xmmC ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E) + punpcklbw xmmE,xmmG ; xmmE=(20 30 22 32 24 34 26 36 28 38 2A 3A 2C 3C 2E 3E) + punpcklbw xmmB,xmmD ; xmmB=(01 11 03 13 05 15 07 17 09 19 0B 1B 0D 1D 0F 1F) + punpcklbw xmmF,xmmH ; xmmF=(21 31 23 33 25 35 27 37 29 39 2B 3B 2D 3D 2F 3F) + + movdqa xmmC,xmmA + punpcklwd xmmA,xmmE ; xmmA=(00 10 20 30 02 12 22 32 04 14 24 34 06 16 26 36) + punpckhwd xmmC,xmmE ; xmmC=(08 18 28 38 0A 1A 2A 3A 0C 1C 2C 3C 0E 1E 2E 3E) + movdqa xmmG,xmmB + punpcklwd xmmB,xmmF ; xmmB=(01 11 21 31 03 13 23 33 05 15 25 35 07 17 27 37) + punpckhwd xmmG,xmmF ; xmmG=(09 19 29 39 0B 1B 2B 3B 0D 1D 2D 3D 0F 1F 2F 3F) + + movdqa xmmD,xmmA + punpckldq xmmA,xmmB ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33) + punpckhdq xmmD,xmmB ; xmmD=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37) + movdqa xmmH,xmmC + punpckldq xmmC,xmmG ; xmmC=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B) + punpckhdq xmmH,xmmG ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F) + + cmp rcx, byte SIZEOF_XMMWORD + jb short .column_st32 + + test rdi, SIZEOF_XMMWORD-1 + jnz short .out1 + ; --(aligned)------------------- + movntdq XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA + movntdq XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD + movntdq XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmC + movntdq XMMWORD [rdi+3*SIZEOF_XMMWORD], xmmH + jmp short .out0 +.out1: ; --(unaligned)----------------- + movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA + movdqu XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD + movdqu XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmC + movdqu XMMWORD [rdi+3*SIZEOF_XMMWORD], xmmH +.out0: + add rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr + sub rcx, byte SIZEOF_XMMWORD + jz near .nextrow + + add rsi, byte SIZEOF_XMMWORD ; inptr0 + add rbx, byte SIZEOF_XMMWORD ; inptr1 + add rdx, byte SIZEOF_XMMWORD ; inptr2 + jmp near .columnloop + +.column_st32: + cmp rcx, byte SIZEOF_XMMWORD/2 + jb short .column_st16 + movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA + movdqu XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD + add rdi, byte 2*SIZEOF_XMMWORD ; outptr + movdqa xmmA,xmmC + movdqa xmmD,xmmH + sub rcx, byte SIZEOF_XMMWORD/2 +.column_st16: + cmp rcx, byte SIZEOF_XMMWORD/4 + jb short .column_st15 + movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA + add rdi, byte SIZEOF_XMMWORD ; outptr + movdqa xmmA,xmmD + sub rcx, byte SIZEOF_XMMWORD/4 +.column_st15: + ; Store two pixels (8 bytes) of xmmA to the output when it has enough + ; space. + cmp rcx, byte SIZEOF_XMMWORD/8 + jb short .column_st7 + movq MMWORD [rdi], xmmA + add rdi, byte SIZEOF_XMMWORD/8*4 + sub rcx, byte SIZEOF_XMMWORD/8 + psrldq xmmA, SIZEOF_XMMWORD/8*4 +.column_st7: + ; Store one pixel (4 bytes) of xmmA to the output when it has enough + ; space. + test rcx, rcx + jz short .nextrow + movd XMM_DWORD [rdi], xmmA + +%endif ; RGB_PIXELSIZE ; --------------- + +.nextrow: + pop rcx + pop rsi + pop rbx + pop rdx + pop rdi + pop rax + + add rsi, byte SIZEOF_JSAMPROW + add rbx, byte SIZEOF_JSAMPROW + add rdx, byte SIZEOF_JSAMPROW + add rdi, byte SIZEOF_JSAMPROW ; output_buf + dec rax ; num_rows + jg near .rowloop + + sfence ; flush the write buffer + +.return: + pop rbx + uncollect_args + mov rsp,rbp ; rsp <- aligned rbp + pop rsp ; rsp <- original rbp + pop rbp + ret + +; For some reason, the OS X linker does not honor the request to align the +; segment unless we do this. + align 16 diff --git a/media/libjpeg/simd/jdcolext-sse2.asm b/media/libjpeg/simd/jdcolext-sse2.asm new file mode 100644 index 000000000..682aef35f --- /dev/null +++ b/media/libjpeg/simd/jdcolext-sse2.asm @@ -0,0 +1,459 @@ +; +; jdcolext.asm - colorspace conversion (SSE2) +; +; Copyright 2009, 2012 Pierre Ossman <ossman@cendio.se> for Cendio AB +; Copyright (C) 2012, D. R. Commander. +; +; Based on the x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler). +; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 +; +; [TAB8] + +%include "jcolsamp.inc" + +; -------------------------------------------------------------------------- +; +; Convert some rows of samples to the output colorspace. +; +; GLOBAL(void) +; jsimd_ycc_rgb_convert_sse2 (JDIMENSION out_width, +; JSAMPIMAGE input_buf, JDIMENSION input_row, +; JSAMPARRAY output_buf, int num_rows) +; + +%define out_width(b) (b)+8 ; JDIMENSION out_width +%define input_buf(b) (b)+12 ; JSAMPIMAGE input_buf +%define input_row(b) (b)+16 ; JDIMENSION input_row +%define output_buf(b) (b)+20 ; JSAMPARRAY output_buf +%define num_rows(b) (b)+24 ; int num_rows + +%define original_ebp ebp+0 +%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM] +%define WK_NUM 2 +%define gotptr wk(0)-SIZEOF_POINTER ; void * gotptr + + align 16 + global EXTN(jsimd_ycc_rgb_convert_sse2) + +EXTN(jsimd_ycc_rgb_convert_sse2): + push ebp + mov eax,esp ; eax = original ebp + sub esp, byte 4 + and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits + mov [esp],eax + mov ebp,esp ; ebp = aligned ebp + lea esp, [wk(0)] + pushpic eax ; make a room for GOT address + push ebx +; push ecx ; need not be preserved +; push edx ; need not be preserved + push esi + push edi + + get_GOT ebx ; get GOT address + movpic POINTER [gotptr], ebx ; save GOT address + + mov ecx, JDIMENSION [out_width(eax)] ; num_cols + test ecx,ecx + jz near .return + + push ecx + + mov edi, JSAMPIMAGE [input_buf(eax)] + mov ecx, JDIMENSION [input_row(eax)] + mov esi, JSAMPARRAY [edi+0*SIZEOF_JSAMPARRAY] + mov ebx, JSAMPARRAY [edi+1*SIZEOF_JSAMPARRAY] + mov edx, JSAMPARRAY [edi+2*SIZEOF_JSAMPARRAY] + lea esi, [esi+ecx*SIZEOF_JSAMPROW] + lea ebx, [ebx+ecx*SIZEOF_JSAMPROW] + lea edx, [edx+ecx*SIZEOF_JSAMPROW] + + pop ecx + + mov edi, JSAMPARRAY [output_buf(eax)] + mov eax, INT [num_rows(eax)] + test eax,eax + jle near .return + alignx 16,7 +.rowloop: + push eax + push edi + push edx + push ebx + push esi + push ecx ; col + + mov esi, JSAMPROW [esi] ; inptr0 + mov ebx, JSAMPROW [ebx] ; inptr1 + mov edx, JSAMPROW [edx] ; inptr2 + mov edi, JSAMPROW [edi] ; outptr + movpic eax, POINTER [gotptr] ; load GOT address (eax) + alignx 16,7 +.columnloop: + + movdqa xmm5, XMMWORD [ebx] ; xmm5=Cb(0123456789ABCDEF) + movdqa xmm1, XMMWORD [edx] ; xmm1=Cr(0123456789ABCDEF) + + pcmpeqw xmm4,xmm4 + pcmpeqw xmm7,xmm7 + psrlw xmm4,BYTE_BIT + psllw xmm7,7 ; xmm7={0xFF80 0xFF80 0xFF80 0xFF80 ..} + movdqa xmm0,xmm4 ; xmm0=xmm4={0xFF 0x00 0xFF 0x00 ..} + + pand xmm4,xmm5 ; xmm4=Cb(02468ACE)=CbE + psrlw xmm5,BYTE_BIT ; xmm5=Cb(13579BDF)=CbO + pand xmm0,xmm1 ; xmm0=Cr(02468ACE)=CrE + psrlw xmm1,BYTE_BIT ; xmm1=Cr(13579BDF)=CrO + + paddw xmm4,xmm7 + paddw xmm5,xmm7 + paddw xmm0,xmm7 + paddw xmm1,xmm7 + + ; (Original) + ; R = Y + 1.40200 * Cr + ; G = Y - 0.34414 * Cb - 0.71414 * Cr + ; B = Y + 1.77200 * Cb + ; + ; (This implementation) + ; R = Y + 0.40200 * Cr + Cr + ; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr + ; B = Y - 0.22800 * Cb + Cb + Cb + + movdqa xmm2,xmm4 ; xmm2=CbE + movdqa xmm3,xmm5 ; xmm3=CbO + paddw xmm4,xmm4 ; xmm4=2*CbE + paddw xmm5,xmm5 ; xmm5=2*CbO + movdqa xmm6,xmm0 ; xmm6=CrE + movdqa xmm7,xmm1 ; xmm7=CrO + paddw xmm0,xmm0 ; xmm0=2*CrE + paddw xmm1,xmm1 ; xmm1=2*CrO + + pmulhw xmm4,[GOTOFF(eax,PW_MF0228)] ; xmm4=(2*CbE * -FIX(0.22800)) + pmulhw xmm5,[GOTOFF(eax,PW_MF0228)] ; xmm5=(2*CbO * -FIX(0.22800)) + pmulhw xmm0,[GOTOFF(eax,PW_F0402)] ; xmm0=(2*CrE * FIX(0.40200)) + pmulhw xmm1,[GOTOFF(eax,PW_F0402)] ; xmm1=(2*CrO * FIX(0.40200)) + + paddw xmm4,[GOTOFF(eax,PW_ONE)] + paddw xmm5,[GOTOFF(eax,PW_ONE)] + psraw xmm4,1 ; xmm4=(CbE * -FIX(0.22800)) + psraw xmm5,1 ; xmm5=(CbO * -FIX(0.22800)) + paddw xmm0,[GOTOFF(eax,PW_ONE)] + paddw xmm1,[GOTOFF(eax,PW_ONE)] + psraw xmm0,1 ; xmm0=(CrE * FIX(0.40200)) + psraw xmm1,1 ; xmm1=(CrO * FIX(0.40200)) + + paddw xmm4,xmm2 + paddw xmm5,xmm3 + paddw xmm4,xmm2 ; xmm4=(CbE * FIX(1.77200))=(B-Y)E + paddw xmm5,xmm3 ; xmm5=(CbO * FIX(1.77200))=(B-Y)O + paddw xmm0,xmm6 ; xmm0=(CrE * FIX(1.40200))=(R-Y)E + paddw xmm1,xmm7 ; xmm1=(CrO * FIX(1.40200))=(R-Y)O + + movdqa XMMWORD [wk(0)], xmm4 ; wk(0)=(B-Y)E + movdqa XMMWORD [wk(1)], xmm5 ; wk(1)=(B-Y)O + + movdqa xmm4,xmm2 + movdqa xmm5,xmm3 + punpcklwd xmm2,xmm6 + punpckhwd xmm4,xmm6 + pmaddwd xmm2,[GOTOFF(eax,PW_MF0344_F0285)] + pmaddwd xmm4,[GOTOFF(eax,PW_MF0344_F0285)] + punpcklwd xmm3,xmm7 + punpckhwd xmm5,xmm7 + pmaddwd xmm3,[GOTOFF(eax,PW_MF0344_F0285)] + pmaddwd xmm5,[GOTOFF(eax,PW_MF0344_F0285)] + + paddd xmm2,[GOTOFF(eax,PD_ONEHALF)] + paddd xmm4,[GOTOFF(eax,PD_ONEHALF)] + psrad xmm2,SCALEBITS + psrad xmm4,SCALEBITS + paddd xmm3,[GOTOFF(eax,PD_ONEHALF)] + paddd xmm5,[GOTOFF(eax,PD_ONEHALF)] + psrad xmm3,SCALEBITS + psrad xmm5,SCALEBITS + + packssdw xmm2,xmm4 ; xmm2=CbE*-FIX(0.344)+CrE*FIX(0.285) + packssdw xmm3,xmm5 ; xmm3=CbO*-FIX(0.344)+CrO*FIX(0.285) + psubw xmm2,xmm6 ; xmm2=CbE*-FIX(0.344)+CrE*-FIX(0.714)=(G-Y)E + psubw xmm3,xmm7 ; xmm3=CbO*-FIX(0.344)+CrO*-FIX(0.714)=(G-Y)O + + movdqa xmm5, XMMWORD [esi] ; xmm5=Y(0123456789ABCDEF) + + pcmpeqw xmm4,xmm4 + psrlw xmm4,BYTE_BIT ; xmm4={0xFF 0x00 0xFF 0x00 ..} + pand xmm4,xmm5 ; xmm4=Y(02468ACE)=YE + psrlw xmm5,BYTE_BIT ; xmm5=Y(13579BDF)=YO + + paddw xmm0,xmm4 ; xmm0=((R-Y)E+YE)=RE=R(02468ACE) + paddw xmm1,xmm5 ; xmm1=((R-Y)O+YO)=RO=R(13579BDF) + packuswb xmm0,xmm0 ; xmm0=R(02468ACE********) + packuswb xmm1,xmm1 ; xmm1=R(13579BDF********) + + paddw xmm2,xmm4 ; xmm2=((G-Y)E+YE)=GE=G(02468ACE) + paddw xmm3,xmm5 ; xmm3=((G-Y)O+YO)=GO=G(13579BDF) + packuswb xmm2,xmm2 ; xmm2=G(02468ACE********) + packuswb xmm3,xmm3 ; xmm3=G(13579BDF********) + + paddw xmm4, XMMWORD [wk(0)] ; xmm4=(YE+(B-Y)E)=BE=B(02468ACE) + paddw xmm5, XMMWORD [wk(1)] ; xmm5=(YO+(B-Y)O)=BO=B(13579BDF) + packuswb xmm4,xmm4 ; xmm4=B(02468ACE********) + packuswb xmm5,xmm5 ; xmm5=B(13579BDF********) + +%if RGB_PIXELSIZE == 3 ; --------------- + + ; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **) + ; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **) + ; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **) + ; xmmG=(** ** ** ** ** ** ** ** **), xmmH=(** ** ** ** ** ** ** ** **) + + punpcklbw xmmA,xmmC ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E) + punpcklbw xmmE,xmmB ; xmmE=(20 01 22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F) + punpcklbw xmmD,xmmF ; xmmD=(11 21 13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F) + + movdqa xmmG,xmmA + movdqa xmmH,xmmA + punpcklwd xmmA,xmmE ; xmmA=(00 10 20 01 02 12 22 03 04 14 24 05 06 16 26 07) + punpckhwd xmmG,xmmE ; xmmG=(08 18 28 09 0A 1A 2A 0B 0C 1C 2C 0D 0E 1E 2E 0F) + + psrldq xmmH,2 ; xmmH=(02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E -- --) + psrldq xmmE,2 ; xmmE=(22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F -- --) + + movdqa xmmC,xmmD + movdqa xmmB,xmmD + punpcklwd xmmD,xmmH ; xmmD=(11 21 02 12 13 23 04 14 15 25 06 16 17 27 08 18) + punpckhwd xmmC,xmmH ; xmmC=(19 29 0A 1A 1B 2B 0C 1C 1D 2D 0E 1E 1F 2F -- --) + + psrldq xmmB,2 ; xmmB=(13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F -- --) + + movdqa xmmF,xmmE + punpcklwd xmmE,xmmB ; xmmE=(22 03 13 23 24 05 15 25 26 07 17 27 28 09 19 29) + punpckhwd xmmF,xmmB ; xmmF=(2A 0B 1B 2B 2C 0D 1D 2D 2E 0F 1F 2F -- -- -- --) + + pshufd xmmH,xmmA,0x4E; xmmH=(04 14 24 05 06 16 26 07 00 10 20 01 02 12 22 03) + movdqa xmmB,xmmE + punpckldq xmmA,xmmD ; xmmA=(00 10 20 01 11 21 02 12 02 12 22 03 13 23 04 14) + punpckldq xmmE,xmmH ; xmmE=(22 03 13 23 04 14 24 05 24 05 15 25 06 16 26 07) + punpckhdq xmmD,xmmB ; xmmD=(15 25 06 16 26 07 17 27 17 27 08 18 28 09 19 29) + + pshufd xmmH,xmmG,0x4E; xmmH=(0C 1C 2C 0D 0E 1E 2E 0F 08 18 28 09 0A 1A 2A 0B) + movdqa xmmB,xmmF + punpckldq xmmG,xmmC ; xmmG=(08 18 28 09 19 29 0A 1A 0A 1A 2A 0B 1B 2B 0C 1C) + punpckldq xmmF,xmmH ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 2C 0D 1D 2D 0E 1E 2E 0F) + punpckhdq xmmC,xmmB ; xmmC=(1D 2D 0E 1E 2E 0F 1F 2F 1F 2F -- -- -- -- -- --) + + punpcklqdq xmmA,xmmE ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05) + punpcklqdq xmmD,xmmG ; xmmD=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A) + punpcklqdq xmmF,xmmC ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F) + + cmp ecx, byte SIZEOF_XMMWORD + jb short .column_st32 + + test edi, SIZEOF_XMMWORD-1 + jnz short .out1 + ; --(aligned)------------------- + movntdq XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA + movntdq XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD + movntdq XMMWORD [edi+2*SIZEOF_XMMWORD], xmmF + jmp short .out0 +.out1: ; --(unaligned)----------------- + movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA + movdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD + movdqu XMMWORD [edi+2*SIZEOF_XMMWORD], xmmF +.out0: + add edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr + sub ecx, byte SIZEOF_XMMWORD + jz near .nextrow + + add esi, byte SIZEOF_XMMWORD ; inptr0 + add ebx, byte SIZEOF_XMMWORD ; inptr1 + add edx, byte SIZEOF_XMMWORD ; inptr2 + jmp near .columnloop + alignx 16,7 + +.column_st32: + lea ecx, [ecx+ecx*2] ; imul ecx, RGB_PIXELSIZE + cmp ecx, byte 2*SIZEOF_XMMWORD + jb short .column_st16 + movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA + movdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD + add edi, byte 2*SIZEOF_XMMWORD ; outptr + movdqa xmmA,xmmF + sub ecx, byte 2*SIZEOF_XMMWORD + jmp short .column_st15 +.column_st16: + cmp ecx, byte SIZEOF_XMMWORD + jb short .column_st15 + movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA + add edi, byte SIZEOF_XMMWORD ; outptr + movdqa xmmA,xmmD + sub ecx, byte SIZEOF_XMMWORD +.column_st15: + ; Store the lower 8 bytes of xmmA to the output when it has enough + ; space. + cmp ecx, byte SIZEOF_MMWORD + jb short .column_st7 + movq XMM_MMWORD [edi], xmmA + add edi, byte SIZEOF_MMWORD + sub ecx, byte SIZEOF_MMWORD + psrldq xmmA, SIZEOF_MMWORD +.column_st7: + ; Store the lower 4 bytes of xmmA to the output when it has enough + ; space. + cmp ecx, byte SIZEOF_DWORD + jb short .column_st3 + movd XMM_DWORD [edi], xmmA + add edi, byte SIZEOF_DWORD + sub ecx, byte SIZEOF_DWORD + psrldq xmmA, SIZEOF_DWORD +.column_st3: + ; Store the lower 2 bytes of eax to the output when it has enough + ; space. + movd eax, xmmA + cmp ecx, byte SIZEOF_WORD + jb short .column_st1 + mov WORD [edi], ax + add edi, byte SIZEOF_WORD + sub ecx, byte SIZEOF_WORD + shr eax, 16 +.column_st1: + ; Store the lower 1 byte of eax to the output when it has enough + ; space. + test ecx, ecx + jz short .nextrow + mov BYTE [edi], al + +%else ; RGB_PIXELSIZE == 4 ; ----------- + +%ifdef RGBX_FILLER_0XFF + pcmpeqb xmm6,xmm6 ; xmm6=XE=X(02468ACE********) + pcmpeqb xmm7,xmm7 ; xmm7=XO=X(13579BDF********) +%else + pxor xmm6,xmm6 ; xmm6=XE=X(02468ACE********) + pxor xmm7,xmm7 ; xmm7=XO=X(13579BDF********) +%endif + ; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **) + ; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **) + ; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **) + ; xmmG=(30 32 34 36 38 3A 3C 3E **), xmmH=(31 33 35 37 39 3B 3D 3F **) + + punpcklbw xmmA,xmmC ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E) + punpcklbw xmmE,xmmG ; xmmE=(20 30 22 32 24 34 26 36 28 38 2A 3A 2C 3C 2E 3E) + punpcklbw xmmB,xmmD ; xmmB=(01 11 03 13 05 15 07 17 09 19 0B 1B 0D 1D 0F 1F) + punpcklbw xmmF,xmmH ; xmmF=(21 31 23 33 25 35 27 37 29 39 2B 3B 2D 3D 2F 3F) + + movdqa xmmC,xmmA + punpcklwd xmmA,xmmE ; xmmA=(00 10 20 30 02 12 22 32 04 14 24 34 06 16 26 36) + punpckhwd xmmC,xmmE ; xmmC=(08 18 28 38 0A 1A 2A 3A 0C 1C 2C 3C 0E 1E 2E 3E) + movdqa xmmG,xmmB + punpcklwd xmmB,xmmF ; xmmB=(01 11 21 31 03 13 23 33 05 15 25 35 07 17 27 37) + punpckhwd xmmG,xmmF ; xmmG=(09 19 29 39 0B 1B 2B 3B 0D 1D 2D 3D 0F 1F 2F 3F) + + movdqa xmmD,xmmA + punpckldq xmmA,xmmB ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33) + punpckhdq xmmD,xmmB ; xmmD=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37) + movdqa xmmH,xmmC + punpckldq xmmC,xmmG ; xmmC=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B) + punpckhdq xmmH,xmmG ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F) + + cmp ecx, byte SIZEOF_XMMWORD + jb short .column_st32 + + test edi, SIZEOF_XMMWORD-1 + jnz short .out1 + ; --(aligned)------------------- + movntdq XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA + movntdq XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD + movntdq XMMWORD [edi+2*SIZEOF_XMMWORD], xmmC + movntdq XMMWORD [edi+3*SIZEOF_XMMWORD], xmmH + jmp short .out0 +.out1: ; --(unaligned)----------------- + movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA + movdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD + movdqu XMMWORD [edi+2*SIZEOF_XMMWORD], xmmC + movdqu XMMWORD [edi+3*SIZEOF_XMMWORD], xmmH +.out0: + add edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr + sub ecx, byte SIZEOF_XMMWORD + jz near .nextrow + + add esi, byte SIZEOF_XMMWORD ; inptr0 + add ebx, byte SIZEOF_XMMWORD ; inptr1 + add edx, byte SIZEOF_XMMWORD ; inptr2 + jmp near .columnloop + alignx 16,7 + +.column_st32: + cmp ecx, byte SIZEOF_XMMWORD/2 + jb short .column_st16 + movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA + movdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD + add edi, byte 2*SIZEOF_XMMWORD ; outptr + movdqa xmmA,xmmC + movdqa xmmD,xmmH + sub ecx, byte SIZEOF_XMMWORD/2 +.column_st16: + cmp ecx, byte SIZEOF_XMMWORD/4 + jb short .column_st15 + movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA + add edi, byte SIZEOF_XMMWORD ; outptr + movdqa xmmA,xmmD + sub ecx, byte SIZEOF_XMMWORD/4 +.column_st15: + ; Store two pixels (8 bytes) of xmmA to the output when it has enough + ; space. + cmp ecx, byte SIZEOF_XMMWORD/8 + jb short .column_st7 + movq XMM_MMWORD [edi], xmmA + add edi, byte SIZEOF_XMMWORD/8*4 + sub ecx, byte SIZEOF_XMMWORD/8 + psrldq xmmA, SIZEOF_XMMWORD/8*4 +.column_st7: + ; Store one pixel (4 bytes) of xmmA to the output when it has enough + ; space. + test ecx, ecx + jz short .nextrow + movd XMM_DWORD [edi], xmmA + +%endif ; RGB_PIXELSIZE ; --------------- + + alignx 16,7 + +.nextrow: + pop ecx + pop esi + pop ebx + pop edx + pop edi + pop eax + + add esi, byte SIZEOF_JSAMPROW + add ebx, byte SIZEOF_JSAMPROW + add edx, byte SIZEOF_JSAMPROW + add edi, byte SIZEOF_JSAMPROW ; output_buf + dec eax ; num_rows + jg near .rowloop + + sfence ; flush the write buffer + +.return: + pop edi + pop esi +; pop edx ; need not be preserved +; pop ecx ; need not be preserved + pop ebx + mov esp,ebp ; esp <- aligned ebp + pop esp ; esp <- original ebp + pop ebp + ret + +; For some reason, the OS X linker does not honor the request to align the +; segment unless we do this. + align 16 diff --git a/media/libjpeg/simd/jdcolor-altivec.c b/media/libjpeg/simd/jdcolor-altivec.c new file mode 100644 index 000000000..0dc4c427c --- /dev/null +++ b/media/libjpeg/simd/jdcolor-altivec.c @@ -0,0 +1,96 @@ +/* + * AltiVec optimizations for libjpeg-turbo + * + * Copyright (C) 2015, D. R. Commander. All Rights Reserved. + * + * This software is provided 'as-is', without any express or implied + * warranty. In no event will the authors be held liable for any damages + * arising from the use of this software. + * + * Permission is granted to anyone to use this software for any purpose, + * including commercial applications, and to alter it and redistribute it + * freely, subject to the following restrictions: + * + * 1. The origin of this software must not be misrepresented; you must not + * claim that you wrote the original software. If you use this software + * in a product, an acknowledgment in the product documentation would be + * appreciated but is not required. + * 2. Altered source versions must be plainly marked as such, and must not be + * misrepresented as being the original software. + * 3. This notice may not be removed or altered from any source distribution. + */ + +/* YCC --> RGB CONVERSION */ + +#include "jsimd_altivec.h" + + +#define F_0_344 22554 /* FIX(0.34414) */ +#define F_0_714 46802 /* FIX(0.71414) */ +#define F_1_402 91881 /* FIX(1.40200) */ +#define F_1_772 116130 /* FIX(1.77200) */ +#define F_0_402 (F_1_402 - 65536) /* FIX(1.40200) - FIX(1) */ +#define F_0_285 (65536 - F_0_714) /* FIX(1) - FIX(0.71414) */ +#define F_0_228 (131072 - F_1_772) /* FIX(2) - FIX(1.77200) */ + +#define SCALEBITS 16 +#define ONE_HALF (1 << (SCALEBITS - 1)) + +#define RGB_INDEX0 {0,1,8,2,3,10,4,5,12,6,7,14,16,17,24,18} +#define RGB_INDEX1 {3,10,4,5,12,6,7,14,16,17,24,18,19,26,20,21} +#define RGB_INDEX2 {12,6,7,14,16,17,24,18,19,26,20,21,28,22,23,30} +#include "jdcolext-altivec.c" +#undef RGB_PIXELSIZE + +#define RGB_PIXELSIZE EXT_RGB_PIXELSIZE +#define jsimd_ycc_rgb_convert_altivec jsimd_ycc_extrgb_convert_altivec +#include "jdcolext-altivec.c" +#undef RGB_PIXELSIZE +#undef RGB_INDEX0 +#undef RGB_INDEX1 +#undef RGB_INDEX2 +#undef jsimd_ycc_rgb_convert_altivec + +#define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE +#define RGB_INDEX {0,1,8,9,2,3,10,11,4,5,12,13,6,7,14,15} +#define jsimd_ycc_rgb_convert_altivec jsimd_ycc_extrgbx_convert_altivec +#include "jdcolext-altivec.c" +#undef RGB_PIXELSIZE +#undef RGB_INDEX +#undef jsimd_ycc_rgb_convert_altivec + +#define RGB_PIXELSIZE EXT_BGR_PIXELSIZE +#define RGB_INDEX0 {8,1,0,10,3,2,12,5,4,14,7,6,24,17,16,26} +#define RGB_INDEX1 {3,2,12,5,4,14,7,6,24,17,16,26,19,18,28,21} +#define RGB_INDEX2 {4,14,7,6,24,17,16,26,19,18,28,21,20,30,23,22} +#define jsimd_ycc_rgb_convert_altivec jsimd_ycc_extbgr_convert_altivec +#include "jdcolext-altivec.c" +#undef RGB_PIXELSIZE +#undef RGB_INDEX0 +#undef RGB_INDEX1 +#undef RGB_INDEX2 +#undef jsimd_ycc_rgb_convert_altivec + +#define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE +#define RGB_INDEX {8,1,0,9,10,3,2,11,12,5,4,13,14,7,6,15} +#define jsimd_ycc_rgb_convert_altivec jsimd_ycc_extbgrx_convert_altivec +#include "jdcolext-altivec.c" +#undef RGB_PIXELSIZE +#undef RGB_INDEX +#undef jsimd_ycc_rgb_convert_altivec + +#define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE +#define RGB_INDEX {9,8,1,0,11,10,3,2,13,12,5,4,15,14,7,6} +#define jsimd_ycc_rgb_convert_altivec jsimd_ycc_extxbgr_convert_altivec +#include "jdcolext-altivec.c" +#undef RGB_PIXELSIZE +#undef RGB_INDEX +#undef jsimd_ycc_rgb_convert_altivec + +#define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE +#define RGB_INDEX {9,0,1,8,11,2,3,10,13,4,5,12,15,6,7,14} +#define jsimd_ycc_rgb_convert_altivec jsimd_ycc_extxrgb_convert_altivec +#include "jdcolext-altivec.c" +#undef RGB_PIXELSIZE +#undef RGB_INDEX +#undef jsimd_ycc_rgb_convert_altivec diff --git a/media/libjpeg/simd/jdcolor-mmx.asm b/media/libjpeg/simd/jdcolor-mmx.asm new file mode 100644 index 000000000..4e58031dd --- /dev/null +++ b/media/libjpeg/simd/jdcolor-mmx.asm @@ -0,0 +1,119 @@ +; +; jdcolor.asm - colorspace conversion (MMX) +; +; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB +; Copyright (C) 2009, D. R. Commander. +; +; Based on the x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler). +; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 +; +; [TAB8] + +%include "jsimdext.inc" + +; -------------------------------------------------------------------------- + +%define SCALEBITS 16 + +F_0_344 equ 22554 ; FIX(0.34414) +F_0_714 equ 46802 ; FIX(0.71414) +F_1_402 equ 91881 ; FIX(1.40200) +F_1_772 equ 116130 ; FIX(1.77200) +F_0_402 equ (F_1_402 - 65536) ; FIX(1.40200) - FIX(1) +F_0_285 equ ( 65536 - F_0_714) ; FIX(1) - FIX(0.71414) +F_0_228 equ (131072 - F_1_772) ; FIX(2) - FIX(1.77200) + +; -------------------------------------------------------------------------- + SECTION SEG_CONST + + alignz 16 + global EXTN(jconst_ycc_rgb_convert_mmx) + +EXTN(jconst_ycc_rgb_convert_mmx): + +PW_F0402 times 4 dw F_0_402 +PW_MF0228 times 4 dw -F_0_228 +PW_MF0344_F0285 times 2 dw -F_0_344, F_0_285 +PW_ONE times 4 dw 1 +PD_ONEHALF times 2 dd 1 << (SCALEBITS-1) + + alignz 16 + +; -------------------------------------------------------------------------- + SECTION SEG_TEXT + BITS 32 + +%include "jdcolext-mmx.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_RGB_RED +%define RGB_GREEN EXT_RGB_GREEN +%define RGB_BLUE EXT_RGB_BLUE +%define RGB_PIXELSIZE EXT_RGB_PIXELSIZE +%define jsimd_ycc_rgb_convert_mmx jsimd_ycc_extrgb_convert_mmx +%include "jdcolext-mmx.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_RGBX_RED +%define RGB_GREEN EXT_RGBX_GREEN +%define RGB_BLUE EXT_RGBX_BLUE +%define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE +%define jsimd_ycc_rgb_convert_mmx jsimd_ycc_extrgbx_convert_mmx +%include "jdcolext-mmx.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_BGR_RED +%define RGB_GREEN EXT_BGR_GREEN +%define RGB_BLUE EXT_BGR_BLUE +%define RGB_PIXELSIZE EXT_BGR_PIXELSIZE +%define jsimd_ycc_rgb_convert_mmx jsimd_ycc_extbgr_convert_mmx +%include "jdcolext-mmx.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_BGRX_RED +%define RGB_GREEN EXT_BGRX_GREEN +%define RGB_BLUE EXT_BGRX_BLUE +%define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE +%define jsimd_ycc_rgb_convert_mmx jsimd_ycc_extbgrx_convert_mmx +%include "jdcolext-mmx.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_XBGR_RED +%define RGB_GREEN EXT_XBGR_GREEN +%define RGB_BLUE EXT_XBGR_BLUE +%define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE +%define jsimd_ycc_rgb_convert_mmx jsimd_ycc_extxbgr_convert_mmx +%include "jdcolext-mmx.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_XRGB_RED +%define RGB_GREEN EXT_XRGB_GREEN +%define RGB_BLUE EXT_XRGB_BLUE +%define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE +%define jsimd_ycc_rgb_convert_mmx jsimd_ycc_extxrgb_convert_mmx +%include "jdcolext-mmx.asm" diff --git a/media/libjpeg/simd/jdcolor-sse2-64.asm b/media/libjpeg/simd/jdcolor-sse2-64.asm new file mode 100644 index 000000000..d2bf21000 --- /dev/null +++ b/media/libjpeg/simd/jdcolor-sse2-64.asm @@ -0,0 +1,119 @@ +; +; jdcolor.asm - colorspace conversion (64-bit SSE2) +; +; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB +; Copyright (C) 2009, D. R. Commander. +; +; Based on the x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler). +; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 +; +; [TAB8] + +%include "jsimdext.inc" + +; -------------------------------------------------------------------------- + +%define SCALEBITS 16 + +F_0_344 equ 22554 ; FIX(0.34414) +F_0_714 equ 46802 ; FIX(0.71414) +F_1_402 equ 91881 ; FIX(1.40200) +F_1_772 equ 116130 ; FIX(1.77200) +F_0_402 equ (F_1_402 - 65536) ; FIX(1.40200) - FIX(1) +F_0_285 equ ( 65536 - F_0_714) ; FIX(1) - FIX(0.71414) +F_0_228 equ (131072 - F_1_772) ; FIX(2) - FIX(1.77200) + +; -------------------------------------------------------------------------- + SECTION SEG_CONST + + alignz 16 + global EXTN(jconst_ycc_rgb_convert_sse2) + +EXTN(jconst_ycc_rgb_convert_sse2): + +PW_F0402 times 8 dw F_0_402 +PW_MF0228 times 8 dw -F_0_228 +PW_MF0344_F0285 times 4 dw -F_0_344, F_0_285 +PW_ONE times 8 dw 1 +PD_ONEHALF times 4 dd 1 << (SCALEBITS-1) + + alignz 16 + +; -------------------------------------------------------------------------- + SECTION SEG_TEXT + BITS 64 + +%include "jdcolext-sse2-64.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_RGB_RED +%define RGB_GREEN EXT_RGB_GREEN +%define RGB_BLUE EXT_RGB_BLUE +%define RGB_PIXELSIZE EXT_RGB_PIXELSIZE +%define jsimd_ycc_rgb_convert_sse2 jsimd_ycc_extrgb_convert_sse2 +%include "jdcolext-sse2-64.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_RGBX_RED +%define RGB_GREEN EXT_RGBX_GREEN +%define RGB_BLUE EXT_RGBX_BLUE +%define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE +%define jsimd_ycc_rgb_convert_sse2 jsimd_ycc_extrgbx_convert_sse2 +%include "jdcolext-sse2-64.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_BGR_RED +%define RGB_GREEN EXT_BGR_GREEN +%define RGB_BLUE EXT_BGR_BLUE +%define RGB_PIXELSIZE EXT_BGR_PIXELSIZE +%define jsimd_ycc_rgb_convert_sse2 jsimd_ycc_extbgr_convert_sse2 +%include "jdcolext-sse2-64.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_BGRX_RED +%define RGB_GREEN EXT_BGRX_GREEN +%define RGB_BLUE EXT_BGRX_BLUE +%define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE +%define jsimd_ycc_rgb_convert_sse2 jsimd_ycc_extbgrx_convert_sse2 +%include "jdcolext-sse2-64.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_XBGR_RED +%define RGB_GREEN EXT_XBGR_GREEN +%define RGB_BLUE EXT_XBGR_BLUE +%define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE +%define jsimd_ycc_rgb_convert_sse2 jsimd_ycc_extxbgr_convert_sse2 +%include "jdcolext-sse2-64.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_XRGB_RED +%define RGB_GREEN EXT_XRGB_GREEN +%define RGB_BLUE EXT_XRGB_BLUE +%define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE +%define jsimd_ycc_rgb_convert_sse2 jsimd_ycc_extxrgb_convert_sse2 +%include "jdcolext-sse2-64.asm" diff --git a/media/libjpeg/simd/jdcolor-sse2.asm b/media/libjpeg/simd/jdcolor-sse2.asm new file mode 100644 index 000000000..7ff5d05d0 --- /dev/null +++ b/media/libjpeg/simd/jdcolor-sse2.asm @@ -0,0 +1,119 @@ +; +; jdcolor.asm - colorspace conversion (SSE2) +; +; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB +; Copyright (C) 2009, D. R. Commander. +; +; Based on the x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler). +; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 +; +; [TAB8] + +%include "jsimdext.inc" + +; -------------------------------------------------------------------------- + +%define SCALEBITS 16 + +F_0_344 equ 22554 ; FIX(0.34414) +F_0_714 equ 46802 ; FIX(0.71414) +F_1_402 equ 91881 ; FIX(1.40200) +F_1_772 equ 116130 ; FIX(1.77200) +F_0_402 equ (F_1_402 - 65536) ; FIX(1.40200) - FIX(1) +F_0_285 equ ( 65536 - F_0_714) ; FIX(1) - FIX(0.71414) +F_0_228 equ (131072 - F_1_772) ; FIX(2) - FIX(1.77200) + +; -------------------------------------------------------------------------- + SECTION SEG_CONST + + alignz 16 + global EXTN(jconst_ycc_rgb_convert_sse2) + +EXTN(jconst_ycc_rgb_convert_sse2): + +PW_F0402 times 8 dw F_0_402 +PW_MF0228 times 8 dw -F_0_228 +PW_MF0344_F0285 times 4 dw -F_0_344, F_0_285 +PW_ONE times 8 dw 1 +PD_ONEHALF times 4 dd 1 << (SCALEBITS-1) + + alignz 16 + +; -------------------------------------------------------------------------- + SECTION SEG_TEXT + BITS 32 + +%include "jdcolext-sse2.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_RGB_RED +%define RGB_GREEN EXT_RGB_GREEN +%define RGB_BLUE EXT_RGB_BLUE +%define RGB_PIXELSIZE EXT_RGB_PIXELSIZE +%define jsimd_ycc_rgb_convert_sse2 jsimd_ycc_extrgb_convert_sse2 +%include "jdcolext-sse2.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_RGBX_RED +%define RGB_GREEN EXT_RGBX_GREEN +%define RGB_BLUE EXT_RGBX_BLUE +%define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE +%define jsimd_ycc_rgb_convert_sse2 jsimd_ycc_extrgbx_convert_sse2 +%include "jdcolext-sse2.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_BGR_RED +%define RGB_GREEN EXT_BGR_GREEN +%define RGB_BLUE EXT_BGR_BLUE +%define RGB_PIXELSIZE EXT_BGR_PIXELSIZE +%define jsimd_ycc_rgb_convert_sse2 jsimd_ycc_extbgr_convert_sse2 +%include "jdcolext-sse2.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_BGRX_RED +%define RGB_GREEN EXT_BGRX_GREEN +%define RGB_BLUE EXT_BGRX_BLUE +%define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE +%define jsimd_ycc_rgb_convert_sse2 jsimd_ycc_extbgrx_convert_sse2 +%include "jdcolext-sse2.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_XBGR_RED +%define RGB_GREEN EXT_XBGR_GREEN +%define RGB_BLUE EXT_XBGR_BLUE +%define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE +%define jsimd_ycc_rgb_convert_sse2 jsimd_ycc_extxbgr_convert_sse2 +%include "jdcolext-sse2.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_XRGB_RED +%define RGB_GREEN EXT_XRGB_GREEN +%define RGB_BLUE EXT_XRGB_BLUE +%define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE +%define jsimd_ycc_rgb_convert_sse2 jsimd_ycc_extxrgb_convert_sse2 +%include "jdcolext-sse2.asm" diff --git a/media/libjpeg/simd/jdct.inc b/media/libjpeg/simd/jdct.inc new file mode 100644 index 000000000..b9761071e --- /dev/null +++ b/media/libjpeg/simd/jdct.inc @@ -0,0 +1,27 @@ +; +; jdct.inc - private declarations for forward & reverse DCT subsystems +; +; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB +; +; Based on the x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; [TAB8] + +; Each IDCT routine is responsible for range-limiting its results and +; converting them to unsigned form (0..MAXJSAMPLE). The raw outputs could +; be quite far out of range if the input data is corrupt, so a bulletproof +; range-limiting step is required. We use a mask-and-table-lookup method +; to do the combined operations quickly. +; +%define RANGE_MASK (MAXJSAMPLE * 4 + 3) ; 2 bits wider than legal samples + +%define ROW(n,b,s) ((b)+(n)*(s)) +%define COL(n,b,s) ((b)+(n)*(s)*DCTSIZE) + +%define DWBLOCK(m,n,b,s) ((b)+(m)*DCTSIZE*(s)+(n)*SIZEOF_DWORD) +%define MMBLOCK(m,n,b,s) ((b)+(m)*DCTSIZE*(s)+(n)*SIZEOF_MMWORD) +%define XMMBLOCK(m,n,b,s) ((b)+(m)*DCTSIZE*(s)+(n)*SIZEOF_XMMWORD) + +; -------------------------------------------------------------------------- diff --git a/media/libjpeg/simd/jdmerge-altivec.c b/media/libjpeg/simd/jdmerge-altivec.c new file mode 100644 index 000000000..6a35f2019 --- /dev/null +++ b/media/libjpeg/simd/jdmerge-altivec.c @@ -0,0 +1,108 @@ +/* + * AltiVec optimizations for libjpeg-turbo + * + * Copyright (C) 2015, D. R. Commander. All Rights Reserved. + * + * This software is provided 'as-is', without any express or implied + * warranty. In no event will the authors be held liable for any damages + * arising from the use of this software. + * + * Permission is granted to anyone to use this software for any purpose, + * including commercial applications, and to alter it and redistribute it + * freely, subject to the following restrictions: + * + * 1. The origin of this software must not be misrepresented; you must not + * claim that you wrote the original software. If you use this software + * in a product, an acknowledgment in the product documentation would be + * appreciated but is not required. + * 2. Altered source versions must be plainly marked as such, and must not be + * misrepresented as being the original software. + * 3. This notice may not be removed or altered from any source distribution. + */ + +/* MERGED YCC --> RGB CONVERSION AND UPSAMPLING */ + +#include "jsimd_altivec.h" + + +#define F_0_344 22554 /* FIX(0.34414) */ +#define F_0_714 46802 /* FIX(0.71414) */ +#define F_1_402 91881 /* FIX(1.40200) */ +#define F_1_772 116130 /* FIX(1.77200) */ +#define F_0_402 (F_1_402 - 65536) /* FIX(1.40200) - FIX(1) */ +#define F_0_285 (65536 - F_0_714) /* FIX(1) - FIX(0.71414) */ +#define F_0_228 (131072 - F_1_772) /* FIX(2) - FIX(1.77200) */ + +#define SCALEBITS 16 +#define ONE_HALF (1 << (SCALEBITS - 1)) + +#define RGB_INDEX0 {0,1,8,2,3,10,4,5,12,6,7,14,16,17,24,18} +#define RGB_INDEX1 {3,10,4,5,12,6,7,14,16,17,24,18,19,26,20,21} +#define RGB_INDEX2 {12,6,7,14,16,17,24,18,19,26,20,21,28,22,23,30} +#include "jdmrgext-altivec.c" +#undef RGB_PIXELSIZE + +#define RGB_PIXELSIZE EXT_RGB_PIXELSIZE +#define jsimd_h2v1_merged_upsample_altivec jsimd_h2v1_extrgb_merged_upsample_altivec +#define jsimd_h2v2_merged_upsample_altivec jsimd_h2v2_extrgb_merged_upsample_altivec +#include "jdmrgext-altivec.c" +#undef RGB_PIXELSIZE +#undef RGB_INDEX0 +#undef RGB_INDEX1 +#undef RGB_INDEX2 +#undef jsimd_h2v1_merged_upsample_altivec +#undef jsimd_h2v2_merged_upsample_altivec + +#define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE +#define RGB_INDEX {0,1,8,9,2,3,10,11,4,5,12,13,6,7,14,15} +#define jsimd_h2v1_merged_upsample_altivec jsimd_h2v1_extrgbx_merged_upsample_altivec +#define jsimd_h2v2_merged_upsample_altivec jsimd_h2v2_extrgbx_merged_upsample_altivec +#include "jdmrgext-altivec.c" +#undef RGB_PIXELSIZE +#undef RGB_INDEX +#undef jsimd_h2v1_merged_upsample_altivec +#undef jsimd_h2v2_merged_upsample_altivec + +#define RGB_PIXELSIZE EXT_BGR_PIXELSIZE +#define RGB_INDEX0 {8,1,0,10,3,2,12,5,4,14,7,6,24,17,16,26} +#define RGB_INDEX1 {3,2,12,5,4,14,7,6,24,17,16,26,19,18,28,21} +#define RGB_INDEX2 {4,14,7,6,24,17,16,26,19,18,28,21,20,30,23,22} +#define jsimd_h2v1_merged_upsample_altivec jsimd_h2v1_extbgr_merged_upsample_altivec +#define jsimd_h2v2_merged_upsample_altivec jsimd_h2v2_extbgr_merged_upsample_altivec +#include "jdmrgext-altivec.c" +#undef RGB_PIXELSIZE +#undef RGB_INDEX0 +#undef RGB_INDEX1 +#undef RGB_INDEX2 +#undef jsimd_h2v1_merged_upsample_altivec +#undef jsimd_h2v2_merged_upsample_altivec + +#define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE +#define RGB_INDEX {8,1,0,9,10,3,2,11,12,5,4,13,14,7,6,15} +#define jsimd_h2v1_merged_upsample_altivec jsimd_h2v1_extbgrx_merged_upsample_altivec +#define jsimd_h2v2_merged_upsample_altivec jsimd_h2v2_extbgrx_merged_upsample_altivec +#include "jdmrgext-altivec.c" +#undef RGB_PIXELSIZE +#undef RGB_INDEX +#undef jsimd_h2v1_merged_upsample_altivec +#undef jsimd_h2v2_merged_upsample_altivec + +#define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE +#define RGB_INDEX {9,8,1,0,11,10,3,2,13,12,5,4,15,14,7,6} +#define jsimd_h2v1_merged_upsample_altivec jsimd_h2v1_extxbgr_merged_upsample_altivec +#define jsimd_h2v2_merged_upsample_altivec jsimd_h2v2_extxbgr_merged_upsample_altivec +#include "jdmrgext-altivec.c" +#undef RGB_PIXELSIZE +#undef RGB_INDEX +#undef jsimd_h2v1_merged_upsample_altivec +#undef jsimd_h2v2_merged_upsample_altivec + +#define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE +#define RGB_INDEX {9,0,1,8,11,2,3,10,13,4,5,12,15,6,7,14} +#define jsimd_h2v1_merged_upsample_altivec jsimd_h2v1_extxrgb_merged_upsample_altivec +#define jsimd_h2v2_merged_upsample_altivec jsimd_h2v2_extxrgb_merged_upsample_altivec +#include "jdmrgext-altivec.c" +#undef RGB_PIXELSIZE +#undef RGB_INDEX +#undef jsimd_h2v1_merged_upsample_altivec +#undef jsimd_h2v2_merged_upsample_altivec diff --git a/media/libjpeg/simd/jdmerge-mmx.asm b/media/libjpeg/simd/jdmerge-mmx.asm new file mode 100644 index 000000000..ee58bff1c --- /dev/null +++ b/media/libjpeg/simd/jdmerge-mmx.asm @@ -0,0 +1,125 @@ +; +; jdmerge.asm - merged upsampling/color conversion (MMX) +; +; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB +; Copyright (C) 2009, D. R. Commander. +; +; Based on the x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler). +; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 +; +; [TAB8] + +%include "jsimdext.inc" + +; -------------------------------------------------------------------------- + +%define SCALEBITS 16 + +F_0_344 equ 22554 ; FIX(0.34414) +F_0_714 equ 46802 ; FIX(0.71414) +F_1_402 equ 91881 ; FIX(1.40200) +F_1_772 equ 116130 ; FIX(1.77200) +F_0_402 equ (F_1_402 - 65536) ; FIX(1.40200) - FIX(1) +F_0_285 equ ( 65536 - F_0_714) ; FIX(1) - FIX(0.71414) +F_0_228 equ (131072 - F_1_772) ; FIX(2) - FIX(1.77200) + +; -------------------------------------------------------------------------- + SECTION SEG_CONST + + alignz 16 + global EXTN(jconst_merged_upsample_mmx) + +EXTN(jconst_merged_upsample_mmx): + +PW_F0402 times 4 dw F_0_402 +PW_MF0228 times 4 dw -F_0_228 +PW_MF0344_F0285 times 2 dw -F_0_344, F_0_285 +PW_ONE times 4 dw 1 +PD_ONEHALF times 2 dd 1 << (SCALEBITS-1) + + alignz 16 + +; -------------------------------------------------------------------------- + SECTION SEG_TEXT + BITS 32 + +%include "jdmrgext-mmx.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_RGB_RED +%define RGB_GREEN EXT_RGB_GREEN +%define RGB_BLUE EXT_RGB_BLUE +%define RGB_PIXELSIZE EXT_RGB_PIXELSIZE +%define jsimd_h2v1_merged_upsample_mmx jsimd_h2v1_extrgb_merged_upsample_mmx +%define jsimd_h2v2_merged_upsample_mmx jsimd_h2v2_extrgb_merged_upsample_mmx +%include "jdmrgext-mmx.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_RGBX_RED +%define RGB_GREEN EXT_RGBX_GREEN +%define RGB_BLUE EXT_RGBX_BLUE +%define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE +%define jsimd_h2v1_merged_upsample_mmx jsimd_h2v1_extrgbx_merged_upsample_mmx +%define jsimd_h2v2_merged_upsample_mmx jsimd_h2v2_extrgbx_merged_upsample_mmx +%include "jdmrgext-mmx.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_BGR_RED +%define RGB_GREEN EXT_BGR_GREEN +%define RGB_BLUE EXT_BGR_BLUE +%define RGB_PIXELSIZE EXT_BGR_PIXELSIZE +%define jsimd_h2v1_merged_upsample_mmx jsimd_h2v1_extbgr_merged_upsample_mmx +%define jsimd_h2v2_merged_upsample_mmx jsimd_h2v2_extbgr_merged_upsample_mmx +%include "jdmrgext-mmx.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_BGRX_RED +%define RGB_GREEN EXT_BGRX_GREEN +%define RGB_BLUE EXT_BGRX_BLUE +%define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE +%define jsimd_h2v1_merged_upsample_mmx jsimd_h2v1_extbgrx_merged_upsample_mmx +%define jsimd_h2v2_merged_upsample_mmx jsimd_h2v2_extbgrx_merged_upsample_mmx +%include "jdmrgext-mmx.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_XBGR_RED +%define RGB_GREEN EXT_XBGR_GREEN +%define RGB_BLUE EXT_XBGR_BLUE +%define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE +%define jsimd_h2v1_merged_upsample_mmx jsimd_h2v1_extxbgr_merged_upsample_mmx +%define jsimd_h2v2_merged_upsample_mmx jsimd_h2v2_extxbgr_merged_upsample_mmx +%include "jdmrgext-mmx.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_XRGB_RED +%define RGB_GREEN EXT_XRGB_GREEN +%define RGB_BLUE EXT_XRGB_BLUE +%define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE +%define jsimd_h2v1_merged_upsample_mmx jsimd_h2v1_extxrgb_merged_upsample_mmx +%define jsimd_h2v2_merged_upsample_mmx jsimd_h2v2_extxrgb_merged_upsample_mmx +%include "jdmrgext-mmx.asm" diff --git a/media/libjpeg/simd/jdmerge-sse2-64.asm b/media/libjpeg/simd/jdmerge-sse2-64.asm new file mode 100644 index 000000000..244bd4023 --- /dev/null +++ b/media/libjpeg/simd/jdmerge-sse2-64.asm @@ -0,0 +1,125 @@ +; +; jdmerge.asm - merged upsampling/color conversion (64-bit SSE2) +; +; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB +; Copyright (C) 2009, D. R. Commander. +; +; Based on the x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler). +; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 +; +; [TAB8] + +%include "jsimdext.inc" + +; -------------------------------------------------------------------------- + +%define SCALEBITS 16 + +F_0_344 equ 22554 ; FIX(0.34414) +F_0_714 equ 46802 ; FIX(0.71414) +F_1_402 equ 91881 ; FIX(1.40200) +F_1_772 equ 116130 ; FIX(1.77200) +F_0_402 equ (F_1_402 - 65536) ; FIX(1.40200) - FIX(1) +F_0_285 equ ( 65536 - F_0_714) ; FIX(1) - FIX(0.71414) +F_0_228 equ (131072 - F_1_772) ; FIX(2) - FIX(1.77200) + +; -------------------------------------------------------------------------- + SECTION SEG_CONST + + alignz 16 + global EXTN(jconst_merged_upsample_sse2) + +EXTN(jconst_merged_upsample_sse2): + +PW_F0402 times 8 dw F_0_402 +PW_MF0228 times 8 dw -F_0_228 +PW_MF0344_F0285 times 4 dw -F_0_344, F_0_285 +PW_ONE times 8 dw 1 +PD_ONEHALF times 4 dd 1 << (SCALEBITS-1) + + alignz 16 + +; -------------------------------------------------------------------------- + SECTION SEG_TEXT + BITS 64 + +%include "jdmrgext-sse2-64.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_RGB_RED +%define RGB_GREEN EXT_RGB_GREEN +%define RGB_BLUE EXT_RGB_BLUE +%define RGB_PIXELSIZE EXT_RGB_PIXELSIZE +%define jsimd_h2v1_merged_upsample_sse2 jsimd_h2v1_extrgb_merged_upsample_sse2 +%define jsimd_h2v2_merged_upsample_sse2 jsimd_h2v2_extrgb_merged_upsample_sse2 +%include "jdmrgext-sse2-64.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_RGBX_RED +%define RGB_GREEN EXT_RGBX_GREEN +%define RGB_BLUE EXT_RGBX_BLUE +%define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE +%define jsimd_h2v1_merged_upsample_sse2 jsimd_h2v1_extrgbx_merged_upsample_sse2 +%define jsimd_h2v2_merged_upsample_sse2 jsimd_h2v2_extrgbx_merged_upsample_sse2 +%include "jdmrgext-sse2-64.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_BGR_RED +%define RGB_GREEN EXT_BGR_GREEN +%define RGB_BLUE EXT_BGR_BLUE +%define RGB_PIXELSIZE EXT_BGR_PIXELSIZE +%define jsimd_h2v1_merged_upsample_sse2 jsimd_h2v1_extbgr_merged_upsample_sse2 +%define jsimd_h2v2_merged_upsample_sse2 jsimd_h2v2_extbgr_merged_upsample_sse2 +%include "jdmrgext-sse2-64.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_BGRX_RED +%define RGB_GREEN EXT_BGRX_GREEN +%define RGB_BLUE EXT_BGRX_BLUE +%define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE +%define jsimd_h2v1_merged_upsample_sse2 jsimd_h2v1_extbgrx_merged_upsample_sse2 +%define jsimd_h2v2_merged_upsample_sse2 jsimd_h2v2_extbgrx_merged_upsample_sse2 +%include "jdmrgext-sse2-64.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_XBGR_RED +%define RGB_GREEN EXT_XBGR_GREEN +%define RGB_BLUE EXT_XBGR_BLUE +%define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE +%define jsimd_h2v1_merged_upsample_sse2 jsimd_h2v1_extxbgr_merged_upsample_sse2 +%define jsimd_h2v2_merged_upsample_sse2 jsimd_h2v2_extxbgr_merged_upsample_sse2 +%include "jdmrgext-sse2-64.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_XRGB_RED +%define RGB_GREEN EXT_XRGB_GREEN +%define RGB_BLUE EXT_XRGB_BLUE +%define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE +%define jsimd_h2v1_merged_upsample_sse2 jsimd_h2v1_extxrgb_merged_upsample_sse2 +%define jsimd_h2v2_merged_upsample_sse2 jsimd_h2v2_extxrgb_merged_upsample_sse2 +%include "jdmrgext-sse2-64.asm" diff --git a/media/libjpeg/simd/jdmerge-sse2.asm b/media/libjpeg/simd/jdmerge-sse2.asm new file mode 100644 index 000000000..236de5a38 --- /dev/null +++ b/media/libjpeg/simd/jdmerge-sse2.asm @@ -0,0 +1,125 @@ +; +; jdmerge.asm - merged upsampling/color conversion (SSE2) +; +; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB +; Copyright (C) 2009, D. R. Commander. +; +; Based on the x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler). +; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 +; +; [TAB8] + +%include "jsimdext.inc" + +; -------------------------------------------------------------------------- + +%define SCALEBITS 16 + +F_0_344 equ 22554 ; FIX(0.34414) +F_0_714 equ 46802 ; FIX(0.71414) +F_1_402 equ 91881 ; FIX(1.40200) +F_1_772 equ 116130 ; FIX(1.77200) +F_0_402 equ (F_1_402 - 65536) ; FIX(1.40200) - FIX(1) +F_0_285 equ ( 65536 - F_0_714) ; FIX(1) - FIX(0.71414) +F_0_228 equ (131072 - F_1_772) ; FIX(2) - FIX(1.77200) + +; -------------------------------------------------------------------------- + SECTION SEG_CONST + + alignz 16 + global EXTN(jconst_merged_upsample_sse2) + +EXTN(jconst_merged_upsample_sse2): + +PW_F0402 times 8 dw F_0_402 +PW_MF0228 times 8 dw -F_0_228 +PW_MF0344_F0285 times 4 dw -F_0_344, F_0_285 +PW_ONE times 8 dw 1 +PD_ONEHALF times 4 dd 1 << (SCALEBITS-1) + + alignz 16 + +; -------------------------------------------------------------------------- + SECTION SEG_TEXT + BITS 32 + +%include "jdmrgext-sse2.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_RGB_RED +%define RGB_GREEN EXT_RGB_GREEN +%define RGB_BLUE EXT_RGB_BLUE +%define RGB_PIXELSIZE EXT_RGB_PIXELSIZE +%define jsimd_h2v1_merged_upsample_sse2 jsimd_h2v1_extrgb_merged_upsample_sse2 +%define jsimd_h2v2_merged_upsample_sse2 jsimd_h2v2_extrgb_merged_upsample_sse2 +%include "jdmrgext-sse2.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_RGBX_RED +%define RGB_GREEN EXT_RGBX_GREEN +%define RGB_BLUE EXT_RGBX_BLUE +%define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE +%define jsimd_h2v1_merged_upsample_sse2 jsimd_h2v1_extrgbx_merged_upsample_sse2 +%define jsimd_h2v2_merged_upsample_sse2 jsimd_h2v2_extrgbx_merged_upsample_sse2 +%include "jdmrgext-sse2.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_BGR_RED +%define RGB_GREEN EXT_BGR_GREEN +%define RGB_BLUE EXT_BGR_BLUE +%define RGB_PIXELSIZE EXT_BGR_PIXELSIZE +%define jsimd_h2v1_merged_upsample_sse2 jsimd_h2v1_extbgr_merged_upsample_sse2 +%define jsimd_h2v2_merged_upsample_sse2 jsimd_h2v2_extbgr_merged_upsample_sse2 +%include "jdmrgext-sse2.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_BGRX_RED +%define RGB_GREEN EXT_BGRX_GREEN +%define RGB_BLUE EXT_BGRX_BLUE +%define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE +%define jsimd_h2v1_merged_upsample_sse2 jsimd_h2v1_extbgrx_merged_upsample_sse2 +%define jsimd_h2v2_merged_upsample_sse2 jsimd_h2v2_extbgrx_merged_upsample_sse2 +%include "jdmrgext-sse2.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_XBGR_RED +%define RGB_GREEN EXT_XBGR_GREEN +%define RGB_BLUE EXT_XBGR_BLUE +%define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE +%define jsimd_h2v1_merged_upsample_sse2 jsimd_h2v1_extxbgr_merged_upsample_sse2 +%define jsimd_h2v2_merged_upsample_sse2 jsimd_h2v2_extxbgr_merged_upsample_sse2 +%include "jdmrgext-sse2.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED EXT_XRGB_RED +%define RGB_GREEN EXT_XRGB_GREEN +%define RGB_BLUE EXT_XRGB_BLUE +%define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE +%define jsimd_h2v1_merged_upsample_sse2 jsimd_h2v1_extxrgb_merged_upsample_sse2 +%define jsimd_h2v2_merged_upsample_sse2 jsimd_h2v2_extxrgb_merged_upsample_sse2 +%include "jdmrgext-sse2.asm" diff --git a/media/libjpeg/simd/jdmrgext-altivec.c b/media/libjpeg/simd/jdmrgext-altivec.c new file mode 100644 index 000000000..55205bb1f --- /dev/null +++ b/media/libjpeg/simd/jdmrgext-altivec.c @@ -0,0 +1,323 @@ +/* + * AltiVec optimizations for libjpeg-turbo + * + * Copyright (C) 2015, D. R. Commander. All Rights Reserved. + * + * This software is provided 'as-is', without any express or implied + * warranty. In no event will the authors be held liable for any damages + * arising from the use of this software. + * + * Permission is granted to anyone to use this software for any purpose, + * including commercial applications, and to alter it and redistribute it + * freely, subject to the following restrictions: + * + * 1. The origin of this software must not be misrepresented; you must not + * claim that you wrote the original software. If you use this software + * in a product, an acknowledgment in the product documentation would be + * appreciated but is not required. + * 2. Altered source versions must be plainly marked as such, and must not be + * misrepresented as being the original software. + * 3. This notice may not be removed or altered from any source distribution. + */ + +/* This file is included by jdmerge-altivec.c */ + + +void jsimd_h2v1_merged_upsample_altivec (JDIMENSION output_width, + JSAMPIMAGE input_buf, + JDIMENSION in_row_group_ctr, + JSAMPARRAY output_buf) +{ + JSAMPROW outptr, inptr0, inptr1, inptr2; + int pitch = output_width * RGB_PIXELSIZE, num_cols, yloop; +#if __BIG_ENDIAN__ + int offset; +#endif + unsigned char __attribute__((aligned(16))) tmpbuf[RGB_PIXELSIZE * 16]; + + __vector unsigned char rgb0, rgb1, rgb2, rgbx0, rgbx1, rgbx2, rgbx3, + y, cb, cr; +#if __BIG_ENDIAN__ + __vector unsigned char edgel, edgeh, edges, out0, out1, out2, out3; +#if RGB_PIXELSIZE == 4 + __vector unsigned char out4; +#endif +#endif +#if RGB_PIXELSIZE == 4 + __vector unsigned char rgb3; +#endif + __vector short rg0, rg1, rg2, rg3, bx0, bx1, bx2, bx3, ye, yo, cbl, cbh, + crl, crh, r_yl, r_yh, g_yl, g_yh, b_yl, b_yh, g_y0w, g_y1w, g_y2w, g_y3w, + rl, rh, gl, gh, bl, bh, re, ro, ge, go, be, bo; + __vector int g_y0, g_y1, g_y2, g_y3; + + /* Constants + * NOTE: The >> 1 is to compensate for the fact that vec_madds() returns 17 + * high-order bits, not 16. + */ + __vector short pw_f0402 = { __8X(F_0_402 >> 1) }, + pw_mf0228 = { __8X(-F_0_228 >> 1) }, + pw_mf0344_f0285 = { __4X2(-F_0_344, F_0_285) }, + pw_one = { __8X(1) }, pw_255 = { __8X(255) }, + pw_cj = { __8X(CENTERJSAMPLE) }; + __vector int pd_onehalf = { __4X(ONE_HALF) }; + __vector unsigned char pb_zero = { __16X(0) }, +#if __BIG_ENDIAN__ + shift_pack_index = {0,1,4,5,8,9,12,13,16,17,20,21,24,25,28,29}, + even_index = {0,16,0,18,0,20,0,22,0,24,0,26,0,28,0,30}, + odd_index = {0,17,0,19,0,21,0,23,0,25,0,27,0,29,0,31}; +#else + shift_pack_index = {2,3,6,7,10,11,14,15,18,19,22,23,26,27,30,31}, + even_index = {16,0,18,0,20,0,22,0,24,0,26,0,28,0,30,0}, + odd_index = {17,0,19,0,21,0,23,0,25,0,27,0,29,0,31,0}; +#endif + + inptr0 = input_buf[0][in_row_group_ctr]; + inptr1 = input_buf[1][in_row_group_ctr]; + inptr2 = input_buf[2][in_row_group_ctr]; + outptr = output_buf[0]; + + for (num_cols = pitch; num_cols > 0; inptr1 += 16, inptr2 += 16) { + + cb = vec_ld(0, inptr1); + /* NOTE: We have to use vec_merge*() here because vec_unpack*() doesn't + * support unsigned vectors. + */ + cbl = (__vector signed short)VEC_UNPACKHU(cb); + cbh = (__vector signed short)VEC_UNPACKLU(cb); + cbl = vec_sub(cbl, pw_cj); + cbh = vec_sub(cbh, pw_cj); + + cr = vec_ld(0, inptr2); + crl = (__vector signed short)VEC_UNPACKHU(cr); + crh = (__vector signed short)VEC_UNPACKLU(cr); + crl = vec_sub(crl, pw_cj); + crh = vec_sub(crh, pw_cj); + + /* (Original) + * R = Y + 1.40200 * Cr + * G = Y - 0.34414 * Cb - 0.71414 * Cr + * B = Y + 1.77200 * Cb + * + * (This implementation) + * R = Y + 0.40200 * Cr + Cr + * G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr + * B = Y - 0.22800 * Cb + Cb + Cb + */ + b_yl = vec_add(cbl, cbl); + b_yh = vec_add(cbh, cbh); + b_yl = vec_madds(b_yl, pw_mf0228, pw_one); + b_yh = vec_madds(b_yh, pw_mf0228, pw_one); + b_yl = vec_sra(b_yl, (__vector unsigned short)pw_one); + b_yh = vec_sra(b_yh, (__vector unsigned short)pw_one); + b_yl = vec_add(b_yl, cbl); + b_yh = vec_add(b_yh, cbh); + b_yl = vec_add(b_yl, cbl); + b_yh = vec_add(b_yh, cbh); + + r_yl = vec_add(crl, crl); + r_yh = vec_add(crh, crh); + r_yl = vec_madds(r_yl, pw_f0402, pw_one); + r_yh = vec_madds(r_yh, pw_f0402, pw_one); + r_yl = vec_sra(r_yl, (__vector unsigned short)pw_one); + r_yh = vec_sra(r_yh, (__vector unsigned short)pw_one); + r_yl = vec_add(r_yl, crl); + r_yh = vec_add(r_yh, crh); + + g_y0w = vec_mergeh(cbl, crl); + g_y1w = vec_mergel(cbl, crl); + g_y0 = vec_msums(g_y0w, pw_mf0344_f0285, pd_onehalf); + g_y1 = vec_msums(g_y1w, pw_mf0344_f0285, pd_onehalf); + g_y2w = vec_mergeh(cbh, crh); + g_y3w = vec_mergel(cbh, crh); + g_y2 = vec_msums(g_y2w, pw_mf0344_f0285, pd_onehalf); + g_y3 = vec_msums(g_y3w, pw_mf0344_f0285, pd_onehalf); + /* Clever way to avoid 4 shifts + 2 packs. This packs the high word from + * each dword into a new 16-bit vector, which is the equivalent of + * descaling the 32-bit results (right-shifting by 16 bits) and then + * packing them. + */ + g_yl = vec_perm((__vector short)g_y0, (__vector short)g_y1, + shift_pack_index); + g_yh = vec_perm((__vector short)g_y2, (__vector short)g_y3, + shift_pack_index); + g_yl = vec_sub(g_yl, crl); + g_yh = vec_sub(g_yh, crh); + + for (yloop = 0; yloop < 2 && num_cols > 0; yloop++, + num_cols -= RGB_PIXELSIZE * 16, + outptr += RGB_PIXELSIZE * 16, inptr0 += 16) { + + y = vec_ld(0, inptr0); + ye = (__vector signed short)vec_perm(pb_zero, y, even_index); + yo = (__vector signed short)vec_perm(pb_zero, y, odd_index); + + if (yloop == 0) { + be = vec_add(b_yl, ye); + bo = vec_add(b_yl, yo); + re = vec_add(r_yl, ye); + ro = vec_add(r_yl, yo); + ge = vec_add(g_yl, ye); + go = vec_add(g_yl, yo); + } else { + be = vec_add(b_yh, ye); + bo = vec_add(b_yh, yo); + re = vec_add(r_yh, ye); + ro = vec_add(r_yh, yo); + ge = vec_add(g_yh, ye); + go = vec_add(g_yh, yo); + } + + rl = vec_mergeh(re, ro); + rh = vec_mergel(re, ro); + gl = vec_mergeh(ge, go); + gh = vec_mergel(ge, go); + bl = vec_mergeh(be, bo); + bh = vec_mergel(be, bo); + + rg0 = vec_mergeh(rl, gl); + bx0 = vec_mergeh(bl, pw_255); + rg1 = vec_mergel(rl, gl); + bx1 = vec_mergel(bl, pw_255); + rg2 = vec_mergeh(rh, gh); + bx2 = vec_mergeh(bh, pw_255); + rg3 = vec_mergel(rh, gh); + bx3 = vec_mergel(bh, pw_255); + + rgbx0 = vec_packsu(rg0, bx0); + rgbx1 = vec_packsu(rg1, bx1); + rgbx2 = vec_packsu(rg2, bx2); + rgbx3 = vec_packsu(rg3, bx3); + +#if RGB_PIXELSIZE == 3 + /* rgbx0 = R0 G0 R1 G1 R2 G2 R3 G3 B0 X0 B1 X1 B2 X2 B3 X3 + * rgbx1 = R4 G4 R5 G5 R6 G6 R7 G7 B4 X4 B5 X5 B6 X6 B7 X7 + * rgbx2 = R8 G8 R9 G9 Ra Ga Rb Gb B8 X8 B9 X9 Ba Xa Bb Xb + * rgbx3 = Rc Gc Rd Gd Re Ge Rf Gf Bc Xc Bd Xd Be Xe Bf Xf + * + * rgb0 = R0 G0 B0 R1 G1 B1 R2 G2 B2 R3 G3 B3 R4 G4 B4 R5 + * rgb1 = G5 B5 R6 G6 B6 R7 G7 B7 R8 G8 B8 R9 G9 B9 Ra Ga + * rgb2 = Ba Rb Gb Bb Rc Gc Bc Rd Gd Bd Re Ge Be Rf Gf Bf + */ + rgb0 = vec_perm(rgbx0, rgbx1, (__vector unsigned char)RGB_INDEX0); + rgb1 = vec_perm(rgbx1, rgbx2, (__vector unsigned char)RGB_INDEX1); + rgb2 = vec_perm(rgbx2, rgbx3, (__vector unsigned char)RGB_INDEX2); +#else + /* rgbx0 = R0 G0 R1 G1 R2 G2 R3 G3 B0 X0 B1 X1 B2 X2 B3 X3 + * rgbx1 = R4 G4 R5 G5 R6 G6 R7 G7 B4 X4 B5 X5 B6 X6 B7 X7 + * rgbx2 = R8 G8 R9 G9 Ra Ga Rb Gb B8 X8 B9 X9 Ba Xa Bb Xb + * rgbx3 = Rc Gc Rd Gd Re Ge Rf Gf Bc Xc Bd Xd Be Xe Bf Xf + * + * rgb0 = R0 G0 B0 X0 R1 G1 B1 X1 R2 G2 B2 X2 R3 G3 B3 X3 + * rgb1 = R4 G4 B4 X4 R5 G5 B5 X5 R6 G6 B6 X6 R7 G7 B7 X7 + * rgb2 = R8 G8 B8 X8 R9 G9 B9 X9 Ra Ga Ba Xa Rb Gb Bb Xb + * rgb3 = Rc Gc Bc Xc Rd Gd Bd Xd Re Ge Be Xe Rf Gf Bf Xf + */ + rgb0 = vec_perm(rgbx0, rgbx0, (__vector unsigned char)RGB_INDEX); + rgb1 = vec_perm(rgbx1, rgbx1, (__vector unsigned char)RGB_INDEX); + rgb2 = vec_perm(rgbx2, rgbx2, (__vector unsigned char)RGB_INDEX); + rgb3 = vec_perm(rgbx3, rgbx3, (__vector unsigned char)RGB_INDEX); +#endif + +#if __BIG_ENDIAN__ + offset = (size_t)outptr & 15; + if (offset) { + __vector unsigned char unaligned_shift_index; + int bytes = num_cols + offset; + + if (bytes < (RGB_PIXELSIZE + 1) * 16 && (bytes & 15)) { + /* Slow path to prevent buffer overwrite. Since there is no way to + * write a partial AltiVec register, overwrite would occur on the + * last chunk of the last image row if the right edge is not on a + * 16-byte boundary. It could also occur on other rows if the bytes + * per row is low enough. Since we can't determine whether we're on + * the last image row, we have to assume every row is the last. + */ + vec_st(rgb0, 0, tmpbuf); + vec_st(rgb1, 16, tmpbuf); + vec_st(rgb2, 32, tmpbuf); +#if RGB_PIXELSIZE == 4 + vec_st(rgb3, 48, tmpbuf); +#endif + memcpy(outptr, tmpbuf, min(num_cols, RGB_PIXELSIZE * 16)); + } else { + /* Fast path */ + unaligned_shift_index = vec_lvsl(0, outptr); + edgel = vec_ld(0, outptr); + edgeh = vec_ld(min(num_cols - 1, RGB_PIXELSIZE * 16), outptr); + edges = vec_perm(edgeh, edgel, unaligned_shift_index); + unaligned_shift_index = vec_lvsr(0, outptr); + out0 = vec_perm(edges, rgb0, unaligned_shift_index); + out1 = vec_perm(rgb0, rgb1, unaligned_shift_index); + out2 = vec_perm(rgb1, rgb2, unaligned_shift_index); +#if RGB_PIXELSIZE == 4 + out3 = vec_perm(rgb2, rgb3, unaligned_shift_index); + out4 = vec_perm(rgb3, edges, unaligned_shift_index); +#else + out3 = vec_perm(rgb2, edges, unaligned_shift_index); +#endif + vec_st(out0, 0, outptr); + if (bytes > 16) + vec_st(out1, 16, outptr); + if (bytes > 32) + vec_st(out2, 32, outptr); + if (bytes > 48) + vec_st(out3, 48, outptr); +#if RGB_PIXELSIZE == 4 + if (bytes > 64) + vec_st(out4, 64, outptr); +#endif + } + } else { +#endif /* __BIG_ENDIAN__ */ + if (num_cols < RGB_PIXELSIZE * 16 && (num_cols & 15)) { + /* Slow path */ + VEC_ST(rgb0, 0, tmpbuf); + VEC_ST(rgb1, 16, tmpbuf); + VEC_ST(rgb2, 32, tmpbuf); +#if RGB_PIXELSIZE == 4 + VEC_ST(rgb3, 48, tmpbuf); +#endif + memcpy(outptr, tmpbuf, min(num_cols, RGB_PIXELSIZE * 16)); + } else { + /* Fast path */ + VEC_ST(rgb0, 0, outptr); + if (num_cols > 16) + VEC_ST(rgb1, 16, outptr); + if (num_cols > 32) + VEC_ST(rgb2, 32, outptr); +#if RGB_PIXELSIZE == 4 + if (num_cols > 48) + VEC_ST(rgb3, 48, outptr); +#endif + } +#if __BIG_ENDIAN__ + } +#endif + } + } +} + + +void jsimd_h2v2_merged_upsample_altivec (JDIMENSION output_width, + JSAMPIMAGE input_buf, + JDIMENSION in_row_group_ctr, + JSAMPARRAY output_buf) +{ + JSAMPROW inptr, outptr; + + inptr = input_buf[0][in_row_group_ctr]; + outptr = output_buf[0]; + + input_buf[0][in_row_group_ctr] = input_buf[0][in_row_group_ctr * 2]; + jsimd_h2v1_merged_upsample_altivec(output_width, input_buf, in_row_group_ctr, + output_buf); + + input_buf[0][in_row_group_ctr] = input_buf[0][in_row_group_ctr * 2 + 1]; + output_buf[0] = output_buf[1]; + jsimd_h2v1_merged_upsample_altivec(output_width, input_buf, in_row_group_ctr, + output_buf); + + input_buf[0][in_row_group_ctr] = inptr; + output_buf[0] = outptr; +} diff --git a/media/libjpeg/simd/jdmrgext-mmx.asm b/media/libjpeg/simd/jdmrgext-mmx.asm new file mode 100644 index 000000000..63f45cf37 --- /dev/null +++ b/media/libjpeg/simd/jdmrgext-mmx.asm @@ -0,0 +1,463 @@ +; +; jdmrgext.asm - merged upsampling/color conversion (MMX) +; +; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB +; +; Based on the x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler). +; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 +; +; [TAB8] + +%include "jcolsamp.inc" + +; -------------------------------------------------------------------------- +; +; Upsample and color convert for the case of 2:1 horizontal and 1:1 vertical. +; +; GLOBAL(void) +; jsimd_h2v1_merged_upsample_mmx (JDIMENSION output_width, +; JSAMPIMAGE input_buf, +; JDIMENSION in_row_group_ctr, +; JSAMPARRAY output_buf); +; + +%define output_width(b) (b)+8 ; JDIMENSION output_width +%define input_buf(b) (b)+12 ; JSAMPIMAGE input_buf +%define in_row_group_ctr(b) (b)+16 ; JDIMENSION in_row_group_ctr +%define output_buf(b) (b)+20 ; JSAMPARRAY output_buf + +%define original_ebp ebp+0 +%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_MMWORD ; mmword wk[WK_NUM] +%define WK_NUM 3 +%define gotptr wk(0)-SIZEOF_POINTER ; void * gotptr + + align 16 + global EXTN(jsimd_h2v1_merged_upsample_mmx) + +EXTN(jsimd_h2v1_merged_upsample_mmx): + push ebp + mov eax,esp ; eax = original ebp + sub esp, byte 4 + and esp, byte (-SIZEOF_MMWORD) ; align to 64 bits + mov [esp],eax + mov ebp,esp ; ebp = aligned ebp + lea esp, [wk(0)] + pushpic eax ; make a room for GOT address + push ebx +; push ecx ; need not be preserved +; push edx ; need not be preserved + push esi + push edi + + get_GOT ebx ; get GOT address + movpic POINTER [gotptr], ebx ; save GOT address + + mov ecx, JDIMENSION [output_width(eax)] ; col + test ecx,ecx + jz near .return + + push ecx + + mov edi, JSAMPIMAGE [input_buf(eax)] + mov ecx, JDIMENSION [in_row_group_ctr(eax)] + mov esi, JSAMPARRAY [edi+0*SIZEOF_JSAMPARRAY] + mov ebx, JSAMPARRAY [edi+1*SIZEOF_JSAMPARRAY] + mov edx, JSAMPARRAY [edi+2*SIZEOF_JSAMPARRAY] + mov edi, JSAMPARRAY [output_buf(eax)] + mov esi, JSAMPROW [esi+ecx*SIZEOF_JSAMPROW] ; inptr0 + mov ebx, JSAMPROW [ebx+ecx*SIZEOF_JSAMPROW] ; inptr1 + mov edx, JSAMPROW [edx+ecx*SIZEOF_JSAMPROW] ; inptr2 + mov edi, JSAMPROW [edi] ; outptr + + pop ecx ; col + + alignx 16,7 +.columnloop: + movpic eax, POINTER [gotptr] ; load GOT address (eax) + + movq mm6, MMWORD [ebx] ; mm6=Cb(01234567) + movq mm7, MMWORD [edx] ; mm7=Cr(01234567) + + pxor mm1,mm1 ; mm1=(all 0's) + pcmpeqw mm3,mm3 + psllw mm3,7 ; mm3={0xFF80 0xFF80 0xFF80 0xFF80} + + movq mm4,mm6 + punpckhbw mm6,mm1 ; mm6=Cb(4567)=CbH + punpcklbw mm4,mm1 ; mm4=Cb(0123)=CbL + movq mm0,mm7 + punpckhbw mm7,mm1 ; mm7=Cr(4567)=CrH + punpcklbw mm0,mm1 ; mm0=Cr(0123)=CrL + + paddw mm6,mm3 + paddw mm4,mm3 + paddw mm7,mm3 + paddw mm0,mm3 + + ; (Original) + ; R = Y + 1.40200 * Cr + ; G = Y - 0.34414 * Cb - 0.71414 * Cr + ; B = Y + 1.77200 * Cb + ; + ; (This implementation) + ; R = Y + 0.40200 * Cr + Cr + ; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr + ; B = Y - 0.22800 * Cb + Cb + Cb + + movq mm5,mm6 ; mm5=CbH + movq mm2,mm4 ; mm2=CbL + paddw mm6,mm6 ; mm6=2*CbH + paddw mm4,mm4 ; mm4=2*CbL + movq mm1,mm7 ; mm1=CrH + movq mm3,mm0 ; mm3=CrL + paddw mm7,mm7 ; mm7=2*CrH + paddw mm0,mm0 ; mm0=2*CrL + + pmulhw mm6,[GOTOFF(eax,PW_MF0228)] ; mm6=(2*CbH * -FIX(0.22800)) + pmulhw mm4,[GOTOFF(eax,PW_MF0228)] ; mm4=(2*CbL * -FIX(0.22800)) + pmulhw mm7,[GOTOFF(eax,PW_F0402)] ; mm7=(2*CrH * FIX(0.40200)) + pmulhw mm0,[GOTOFF(eax,PW_F0402)] ; mm0=(2*CrL * FIX(0.40200)) + + paddw mm6,[GOTOFF(eax,PW_ONE)] + paddw mm4,[GOTOFF(eax,PW_ONE)] + psraw mm6,1 ; mm6=(CbH * -FIX(0.22800)) + psraw mm4,1 ; mm4=(CbL * -FIX(0.22800)) + paddw mm7,[GOTOFF(eax,PW_ONE)] + paddw mm0,[GOTOFF(eax,PW_ONE)] + psraw mm7,1 ; mm7=(CrH * FIX(0.40200)) + psraw mm0,1 ; mm0=(CrL * FIX(0.40200)) + + paddw mm6,mm5 + paddw mm4,mm2 + paddw mm6,mm5 ; mm6=(CbH * FIX(1.77200))=(B-Y)H + paddw mm4,mm2 ; mm4=(CbL * FIX(1.77200))=(B-Y)L + paddw mm7,mm1 ; mm7=(CrH * FIX(1.40200))=(R-Y)H + paddw mm0,mm3 ; mm0=(CrL * FIX(1.40200))=(R-Y)L + + movq MMWORD [wk(0)], mm6 ; wk(0)=(B-Y)H + movq MMWORD [wk(1)], mm7 ; wk(1)=(R-Y)H + + movq mm6,mm5 + movq mm7,mm2 + punpcklwd mm5,mm1 + punpckhwd mm6,mm1 + pmaddwd mm5,[GOTOFF(eax,PW_MF0344_F0285)] + pmaddwd mm6,[GOTOFF(eax,PW_MF0344_F0285)] + punpcklwd mm2,mm3 + punpckhwd mm7,mm3 + pmaddwd mm2,[GOTOFF(eax,PW_MF0344_F0285)] + pmaddwd mm7,[GOTOFF(eax,PW_MF0344_F0285)] + + paddd mm5,[GOTOFF(eax,PD_ONEHALF)] + paddd mm6,[GOTOFF(eax,PD_ONEHALF)] + psrad mm5,SCALEBITS + psrad mm6,SCALEBITS + paddd mm2,[GOTOFF(eax,PD_ONEHALF)] + paddd mm7,[GOTOFF(eax,PD_ONEHALF)] + psrad mm2,SCALEBITS + psrad mm7,SCALEBITS + + packssdw mm5,mm6 ; mm5=CbH*-FIX(0.344)+CrH*FIX(0.285) + packssdw mm2,mm7 ; mm2=CbL*-FIX(0.344)+CrL*FIX(0.285) + psubw mm5,mm1 ; mm5=CbH*-FIX(0.344)+CrH*-FIX(0.714)=(G-Y)H + psubw mm2,mm3 ; mm2=CbL*-FIX(0.344)+CrL*-FIX(0.714)=(G-Y)L + + movq MMWORD [wk(2)], mm5 ; wk(2)=(G-Y)H + + mov al,2 ; Yctr + jmp short .Yloop_1st + alignx 16,7 + +.Yloop_2nd: + movq mm0, MMWORD [wk(1)] ; mm0=(R-Y)H + movq mm2, MMWORD [wk(2)] ; mm2=(G-Y)H + movq mm4, MMWORD [wk(0)] ; mm4=(B-Y)H + alignx 16,7 + +.Yloop_1st: + movq mm7, MMWORD [esi] ; mm7=Y(01234567) + + pcmpeqw mm6,mm6 + psrlw mm6,BYTE_BIT ; mm6={0xFF 0x00 0xFF 0x00 ..} + pand mm6,mm7 ; mm6=Y(0246)=YE + psrlw mm7,BYTE_BIT ; mm7=Y(1357)=YO + + movq mm1,mm0 ; mm1=mm0=(R-Y)(L/H) + movq mm3,mm2 ; mm3=mm2=(G-Y)(L/H) + movq mm5,mm4 ; mm5=mm4=(B-Y)(L/H) + + paddw mm0,mm6 ; mm0=((R-Y)+YE)=RE=(R0 R2 R4 R6) + paddw mm1,mm7 ; mm1=((R-Y)+YO)=RO=(R1 R3 R5 R7) + packuswb mm0,mm0 ; mm0=(R0 R2 R4 R6 ** ** ** **) + packuswb mm1,mm1 ; mm1=(R1 R3 R5 R7 ** ** ** **) + + paddw mm2,mm6 ; mm2=((G-Y)+YE)=GE=(G0 G2 G4 G6) + paddw mm3,mm7 ; mm3=((G-Y)+YO)=GO=(G1 G3 G5 G7) + packuswb mm2,mm2 ; mm2=(G0 G2 G4 G6 ** ** ** **) + packuswb mm3,mm3 ; mm3=(G1 G3 G5 G7 ** ** ** **) + + paddw mm4,mm6 ; mm4=((B-Y)+YE)=BE=(B0 B2 B4 B6) + paddw mm5,mm7 ; mm5=((B-Y)+YO)=BO=(B1 B3 B5 B7) + packuswb mm4,mm4 ; mm4=(B0 B2 B4 B6 ** ** ** **) + packuswb mm5,mm5 ; mm5=(B1 B3 B5 B7 ** ** ** **) + +%if RGB_PIXELSIZE == 3 ; --------------- + + ; mmA=(00 02 04 06 ** ** ** **), mmB=(01 03 05 07 ** ** ** **) + ; mmC=(10 12 14 16 ** ** ** **), mmD=(11 13 15 17 ** ** ** **) + ; mmE=(20 22 24 26 ** ** ** **), mmF=(21 23 25 27 ** ** ** **) + ; mmG=(** ** ** ** ** ** ** **), mmH=(** ** ** ** ** ** ** **) + + punpcklbw mmA,mmC ; mmA=(00 10 02 12 04 14 06 16) + punpcklbw mmE,mmB ; mmE=(20 01 22 03 24 05 26 07) + punpcklbw mmD,mmF ; mmD=(11 21 13 23 15 25 17 27) + + movq mmG,mmA + movq mmH,mmA + punpcklwd mmA,mmE ; mmA=(00 10 20 01 02 12 22 03) + punpckhwd mmG,mmE ; mmG=(04 14 24 05 06 16 26 07) + + psrlq mmH,2*BYTE_BIT ; mmH=(02 12 04 14 06 16 -- --) + psrlq mmE,2*BYTE_BIT ; mmE=(22 03 24 05 26 07 -- --) + + movq mmC,mmD + movq mmB,mmD + punpcklwd mmD,mmH ; mmD=(11 21 02 12 13 23 04 14) + punpckhwd mmC,mmH ; mmC=(15 25 06 16 17 27 -- --) + + psrlq mmB,2*BYTE_BIT ; mmB=(13 23 15 25 17 27 -- --) + + movq mmF,mmE + punpcklwd mmE,mmB ; mmE=(22 03 13 23 24 05 15 25) + punpckhwd mmF,mmB ; mmF=(26 07 17 27 -- -- -- --) + + punpckldq mmA,mmD ; mmA=(00 10 20 01 11 21 02 12) + punpckldq mmE,mmG ; mmE=(22 03 13 23 04 14 24 05) + punpckldq mmC,mmF ; mmC=(15 25 06 16 26 07 17 27) + + cmp ecx, byte SIZEOF_MMWORD + jb short .column_st16 + + movq MMWORD [edi+0*SIZEOF_MMWORD], mmA + movq MMWORD [edi+1*SIZEOF_MMWORD], mmE + movq MMWORD [edi+2*SIZEOF_MMWORD], mmC + + sub ecx, byte SIZEOF_MMWORD + jz near .endcolumn + + add edi, byte RGB_PIXELSIZE*SIZEOF_MMWORD ; outptr + add esi, byte SIZEOF_MMWORD ; inptr0 + dec al ; Yctr + jnz near .Yloop_2nd + + add ebx, byte SIZEOF_MMWORD ; inptr1 + add edx, byte SIZEOF_MMWORD ; inptr2 + jmp near .columnloop + alignx 16,7 + +.column_st16: + lea ecx, [ecx+ecx*2] ; imul ecx, RGB_PIXELSIZE + cmp ecx, byte 2*SIZEOF_MMWORD + jb short .column_st8 + movq MMWORD [edi+0*SIZEOF_MMWORD], mmA + movq MMWORD [edi+1*SIZEOF_MMWORD], mmE + movq mmA,mmC + sub ecx, byte 2*SIZEOF_MMWORD + add edi, byte 2*SIZEOF_MMWORD + jmp short .column_st4 +.column_st8: + cmp ecx, byte SIZEOF_MMWORD + jb short .column_st4 + movq MMWORD [edi+0*SIZEOF_MMWORD], mmA + movq mmA,mmE + sub ecx, byte SIZEOF_MMWORD + add edi, byte SIZEOF_MMWORD +.column_st4: + movd eax,mmA + cmp ecx, byte SIZEOF_DWORD + jb short .column_st2 + mov DWORD [edi+0*SIZEOF_DWORD], eax + psrlq mmA,DWORD_BIT + movd eax,mmA + sub ecx, byte SIZEOF_DWORD + add edi, byte SIZEOF_DWORD +.column_st2: + cmp ecx, byte SIZEOF_WORD + jb short .column_st1 + mov WORD [edi+0*SIZEOF_WORD], ax + shr eax,WORD_BIT + sub ecx, byte SIZEOF_WORD + add edi, byte SIZEOF_WORD +.column_st1: + cmp ecx, byte SIZEOF_BYTE + jb short .endcolumn + mov BYTE [edi+0*SIZEOF_BYTE], al + +%else ; RGB_PIXELSIZE == 4 ; ----------- + +%ifdef RGBX_FILLER_0XFF + pcmpeqb mm6,mm6 ; mm6=(X0 X2 X4 X6 ** ** ** **) + pcmpeqb mm7,mm7 ; mm7=(X1 X3 X5 X7 ** ** ** **) +%else + pxor mm6,mm6 ; mm6=(X0 X2 X4 X6 ** ** ** **) + pxor mm7,mm7 ; mm7=(X1 X3 X5 X7 ** ** ** **) +%endif + ; mmA=(00 02 04 06 ** ** ** **), mmB=(01 03 05 07 ** ** ** **) + ; mmC=(10 12 14 16 ** ** ** **), mmD=(11 13 15 17 ** ** ** **) + ; mmE=(20 22 24 26 ** ** ** **), mmF=(21 23 25 27 ** ** ** **) + ; mmG=(30 32 34 36 ** ** ** **), mmH=(31 33 35 37 ** ** ** **) + + punpcklbw mmA,mmC ; mmA=(00 10 02 12 04 14 06 16) + punpcklbw mmE,mmG ; mmE=(20 30 22 32 24 34 26 36) + punpcklbw mmB,mmD ; mmB=(01 11 03 13 05 15 07 17) + punpcklbw mmF,mmH ; mmF=(21 31 23 33 25 35 27 37) + + movq mmC,mmA + punpcklwd mmA,mmE ; mmA=(00 10 20 30 02 12 22 32) + punpckhwd mmC,mmE ; mmC=(04 14 24 34 06 16 26 36) + movq mmG,mmB + punpcklwd mmB,mmF ; mmB=(01 11 21 31 03 13 23 33) + punpckhwd mmG,mmF ; mmG=(05 15 25 35 07 17 27 37) + + movq mmD,mmA + punpckldq mmA,mmB ; mmA=(00 10 20 30 01 11 21 31) + punpckhdq mmD,mmB ; mmD=(02 12 22 32 03 13 23 33) + movq mmH,mmC + punpckldq mmC,mmG ; mmC=(04 14 24 34 05 15 25 35) + punpckhdq mmH,mmG ; mmH=(06 16 26 36 07 17 27 37) + + cmp ecx, byte SIZEOF_MMWORD + jb short .column_st16 + + movq MMWORD [edi+0*SIZEOF_MMWORD], mmA + movq MMWORD [edi+1*SIZEOF_MMWORD], mmD + movq MMWORD [edi+2*SIZEOF_MMWORD], mmC + movq MMWORD [edi+3*SIZEOF_MMWORD], mmH + + sub ecx, byte SIZEOF_MMWORD + jz short .endcolumn + + add edi, byte RGB_PIXELSIZE*SIZEOF_MMWORD ; outptr + add esi, byte SIZEOF_MMWORD ; inptr0 + dec al ; Yctr + jnz near .Yloop_2nd + + add ebx, byte SIZEOF_MMWORD ; inptr1 + add edx, byte SIZEOF_MMWORD ; inptr2 + jmp near .columnloop + alignx 16,7 + +.column_st16: + cmp ecx, byte SIZEOF_MMWORD/2 + jb short .column_st8 + movq MMWORD [edi+0*SIZEOF_MMWORD], mmA + movq MMWORD [edi+1*SIZEOF_MMWORD], mmD + movq mmA,mmC + movq mmD,mmH + sub ecx, byte SIZEOF_MMWORD/2 + add edi, byte 2*SIZEOF_MMWORD +.column_st8: + cmp ecx, byte SIZEOF_MMWORD/4 + jb short .column_st4 + movq MMWORD [edi+0*SIZEOF_MMWORD], mmA + movq mmA,mmD + sub ecx, byte SIZEOF_MMWORD/4 + add edi, byte 1*SIZEOF_MMWORD +.column_st4: + cmp ecx, byte SIZEOF_MMWORD/8 + jb short .endcolumn + movd DWORD [edi+0*SIZEOF_DWORD], mmA + +%endif ; RGB_PIXELSIZE ; --------------- + +.endcolumn: + emms ; empty MMX state + +.return: + pop edi + pop esi +; pop edx ; need not be preserved +; pop ecx ; need not be preserved + pop ebx + mov esp,ebp ; esp <- aligned ebp + pop esp ; esp <- original ebp + pop ebp + ret + +; -------------------------------------------------------------------------- +; +; Upsample and color convert for the case of 2:1 horizontal and 2:1 vertical. +; +; GLOBAL(void) +; jsimd_h2v2_merged_upsample_mmx (JDIMENSION output_width, +; JSAMPIMAGE input_buf, +; JDIMENSION in_row_group_ctr, +; JSAMPARRAY output_buf); +; + +%define output_width(b) (b)+8 ; JDIMENSION output_width +%define input_buf(b) (b)+12 ; JSAMPIMAGE input_buf +%define in_row_group_ctr(b) (b)+16 ; JDIMENSION in_row_group_ctr +%define output_buf(b) (b)+20 ; JSAMPARRAY output_buf + + align 16 + global EXTN(jsimd_h2v2_merged_upsample_mmx) + +EXTN(jsimd_h2v2_merged_upsample_mmx): + push ebp + mov ebp,esp + push ebx +; push ecx ; need not be preserved +; push edx ; need not be preserved + push esi + push edi + + mov eax, JDIMENSION [output_width(ebp)] + + mov edi, JSAMPIMAGE [input_buf(ebp)] + mov ecx, JDIMENSION [in_row_group_ctr(ebp)] + mov esi, JSAMPARRAY [edi+0*SIZEOF_JSAMPARRAY] + mov ebx, JSAMPARRAY [edi+1*SIZEOF_JSAMPARRAY] + mov edx, JSAMPARRAY [edi+2*SIZEOF_JSAMPARRAY] + mov edi, JSAMPARRAY [output_buf(ebp)] + lea esi, [esi+ecx*SIZEOF_JSAMPROW] + + push edx ; inptr2 + push ebx ; inptr1 + push esi ; inptr00 + mov ebx,esp + + push edi ; output_buf (outptr0) + push ecx ; in_row_group_ctr + push ebx ; input_buf + push eax ; output_width + + call near EXTN(jsimd_h2v1_merged_upsample_mmx) + + add esi, byte SIZEOF_JSAMPROW ; inptr01 + add edi, byte SIZEOF_JSAMPROW ; outptr1 + mov POINTER [ebx+0*SIZEOF_POINTER], esi + mov POINTER [ebx-1*SIZEOF_POINTER], edi + + call near EXTN(jsimd_h2v1_merged_upsample_mmx) + + add esp, byte 7*SIZEOF_DWORD + + pop edi + pop esi +; pop edx ; need not be preserved +; pop ecx ; need not be preserved + pop ebx + pop ebp + ret + +; For some reason, the OS X linker does not honor the request to align the +; segment unless we do this. + align 16 diff --git a/media/libjpeg/simd/jdmrgext-sse2-64.asm b/media/libjpeg/simd/jdmrgext-sse2-64.asm new file mode 100644 index 000000000..ad74c5ff4 --- /dev/null +++ b/media/libjpeg/simd/jdmrgext-sse2-64.asm @@ -0,0 +1,537 @@ +; +; jdmrgext.asm - merged upsampling/color conversion (64-bit SSE2) +; +; Copyright 2009, 2012 Pierre Ossman <ossman@cendio.se> for Cendio AB +; Copyright (C) 2009, 2012, D. R. Commander. +; +; Based on the x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler). +; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 +; +; [TAB8] + +%include "jcolsamp.inc" + +; -------------------------------------------------------------------------- +; +; Upsample and color convert for the case of 2:1 horizontal and 1:1 vertical. +; +; GLOBAL(void) +; jsimd_h2v1_merged_upsample_sse2 (JDIMENSION output_width, +; JSAMPIMAGE input_buf, +; JDIMENSION in_row_group_ctr, +; JSAMPARRAY output_buf); +; + +; r10 = JDIMENSION output_width +; r11 = JSAMPIMAGE input_buf +; r12 = JDIMENSION in_row_group_ctr +; r13 = JSAMPARRAY output_buf + +%define wk(i) rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM] +%define WK_NUM 3 + + align 16 + global EXTN(jsimd_h2v1_merged_upsample_sse2) + +EXTN(jsimd_h2v1_merged_upsample_sse2): + push rbp + mov rax,rsp ; rax = original rbp + sub rsp, byte 4 + and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits + mov [rsp],rax + mov rbp,rsp ; rbp = aligned rbp + lea rsp, [wk(0)] + collect_args + push rbx + + mov ecx, r10d ; col + test rcx,rcx + jz near .return + + push rcx + + mov rdi, r11 + mov ecx, r12d + mov rsi, JSAMPARRAY [rdi+0*SIZEOF_JSAMPARRAY] + mov rbx, JSAMPARRAY [rdi+1*SIZEOF_JSAMPARRAY] + mov rdx, JSAMPARRAY [rdi+2*SIZEOF_JSAMPARRAY] + mov rdi, r13 + mov rsi, JSAMPROW [rsi+rcx*SIZEOF_JSAMPROW] ; inptr0 + mov rbx, JSAMPROW [rbx+rcx*SIZEOF_JSAMPROW] ; inptr1 + mov rdx, JSAMPROW [rdx+rcx*SIZEOF_JSAMPROW] ; inptr2 + mov rdi, JSAMPROW [rdi] ; outptr + + pop rcx ; col + +.columnloop: + + movdqa xmm6, XMMWORD [rbx] ; xmm6=Cb(0123456789ABCDEF) + movdqa xmm7, XMMWORD [rdx] ; xmm7=Cr(0123456789ABCDEF) + + pxor xmm1,xmm1 ; xmm1=(all 0's) + pcmpeqw xmm3,xmm3 + psllw xmm3,7 ; xmm3={0xFF80 0xFF80 0xFF80 0xFF80 ..} + + movdqa xmm4,xmm6 + punpckhbw xmm6,xmm1 ; xmm6=Cb(89ABCDEF)=CbH + punpcklbw xmm4,xmm1 ; xmm4=Cb(01234567)=CbL + movdqa xmm0,xmm7 + punpckhbw xmm7,xmm1 ; xmm7=Cr(89ABCDEF)=CrH + punpcklbw xmm0,xmm1 ; xmm0=Cr(01234567)=CrL + + paddw xmm6,xmm3 + paddw xmm4,xmm3 + paddw xmm7,xmm3 + paddw xmm0,xmm3 + + ; (Original) + ; R = Y + 1.40200 * Cr + ; G = Y - 0.34414 * Cb - 0.71414 * Cr + ; B = Y + 1.77200 * Cb + ; + ; (This implementation) + ; R = Y + 0.40200 * Cr + Cr + ; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr + ; B = Y - 0.22800 * Cb + Cb + Cb + + movdqa xmm5,xmm6 ; xmm5=CbH + movdqa xmm2,xmm4 ; xmm2=CbL + paddw xmm6,xmm6 ; xmm6=2*CbH + paddw xmm4,xmm4 ; xmm4=2*CbL + movdqa xmm1,xmm7 ; xmm1=CrH + movdqa xmm3,xmm0 ; xmm3=CrL + paddw xmm7,xmm7 ; xmm7=2*CrH + paddw xmm0,xmm0 ; xmm0=2*CrL + + pmulhw xmm6,[rel PW_MF0228] ; xmm6=(2*CbH * -FIX(0.22800)) + pmulhw xmm4,[rel PW_MF0228] ; xmm4=(2*CbL * -FIX(0.22800)) + pmulhw xmm7,[rel PW_F0402] ; xmm7=(2*CrH * FIX(0.40200)) + pmulhw xmm0,[rel PW_F0402] ; xmm0=(2*CrL * FIX(0.40200)) + + paddw xmm6,[rel PW_ONE] + paddw xmm4,[rel PW_ONE] + psraw xmm6,1 ; xmm6=(CbH * -FIX(0.22800)) + psraw xmm4,1 ; xmm4=(CbL * -FIX(0.22800)) + paddw xmm7,[rel PW_ONE] + paddw xmm0,[rel PW_ONE] + psraw xmm7,1 ; xmm7=(CrH * FIX(0.40200)) + psraw xmm0,1 ; xmm0=(CrL * FIX(0.40200)) + + paddw xmm6,xmm5 + paddw xmm4,xmm2 + paddw xmm6,xmm5 ; xmm6=(CbH * FIX(1.77200))=(B-Y)H + paddw xmm4,xmm2 ; xmm4=(CbL * FIX(1.77200))=(B-Y)L + paddw xmm7,xmm1 ; xmm7=(CrH * FIX(1.40200))=(R-Y)H + paddw xmm0,xmm3 ; xmm0=(CrL * FIX(1.40200))=(R-Y)L + + movdqa XMMWORD [wk(0)], xmm6 ; wk(0)=(B-Y)H + movdqa XMMWORD [wk(1)], xmm7 ; wk(1)=(R-Y)H + + movdqa xmm6,xmm5 + movdqa xmm7,xmm2 + punpcklwd xmm5,xmm1 + punpckhwd xmm6,xmm1 + pmaddwd xmm5,[rel PW_MF0344_F0285] + pmaddwd xmm6,[rel PW_MF0344_F0285] + punpcklwd xmm2,xmm3 + punpckhwd xmm7,xmm3 + pmaddwd xmm2,[rel PW_MF0344_F0285] + pmaddwd xmm7,[rel PW_MF0344_F0285] + + paddd xmm5,[rel PD_ONEHALF] + paddd xmm6,[rel PD_ONEHALF] + psrad xmm5,SCALEBITS + psrad xmm6,SCALEBITS + paddd xmm2,[rel PD_ONEHALF] + paddd xmm7,[rel PD_ONEHALF] + psrad xmm2,SCALEBITS + psrad xmm7,SCALEBITS + + packssdw xmm5,xmm6 ; xmm5=CbH*-FIX(0.344)+CrH*FIX(0.285) + packssdw xmm2,xmm7 ; xmm2=CbL*-FIX(0.344)+CrL*FIX(0.285) + psubw xmm5,xmm1 ; xmm5=CbH*-FIX(0.344)+CrH*-FIX(0.714)=(G-Y)H + psubw xmm2,xmm3 ; xmm2=CbL*-FIX(0.344)+CrL*-FIX(0.714)=(G-Y)L + + movdqa XMMWORD [wk(2)], xmm5 ; wk(2)=(G-Y)H + + mov al,2 ; Yctr + jmp short .Yloop_1st + +.Yloop_2nd: + movdqa xmm0, XMMWORD [wk(1)] ; xmm0=(R-Y)H + movdqa xmm2, XMMWORD [wk(2)] ; xmm2=(G-Y)H + movdqa xmm4, XMMWORD [wk(0)] ; xmm4=(B-Y)H + +.Yloop_1st: + movdqa xmm7, XMMWORD [rsi] ; xmm7=Y(0123456789ABCDEF) + + pcmpeqw xmm6,xmm6 + psrlw xmm6,BYTE_BIT ; xmm6={0xFF 0x00 0xFF 0x00 ..} + pand xmm6,xmm7 ; xmm6=Y(02468ACE)=YE + psrlw xmm7,BYTE_BIT ; xmm7=Y(13579BDF)=YO + + movdqa xmm1,xmm0 ; xmm1=xmm0=(R-Y)(L/H) + movdqa xmm3,xmm2 ; xmm3=xmm2=(G-Y)(L/H) + movdqa xmm5,xmm4 ; xmm5=xmm4=(B-Y)(L/H) + + paddw xmm0,xmm6 ; xmm0=((R-Y)+YE)=RE=R(02468ACE) + paddw xmm1,xmm7 ; xmm1=((R-Y)+YO)=RO=R(13579BDF) + packuswb xmm0,xmm0 ; xmm0=R(02468ACE********) + packuswb xmm1,xmm1 ; xmm1=R(13579BDF********) + + paddw xmm2,xmm6 ; xmm2=((G-Y)+YE)=GE=G(02468ACE) + paddw xmm3,xmm7 ; xmm3=((G-Y)+YO)=GO=G(13579BDF) + packuswb xmm2,xmm2 ; xmm2=G(02468ACE********) + packuswb xmm3,xmm3 ; xmm3=G(13579BDF********) + + paddw xmm4,xmm6 ; xmm4=((B-Y)+YE)=BE=B(02468ACE) + paddw xmm5,xmm7 ; xmm5=((B-Y)+YO)=BO=B(13579BDF) + packuswb xmm4,xmm4 ; xmm4=B(02468ACE********) + packuswb xmm5,xmm5 ; xmm5=B(13579BDF********) + +%if RGB_PIXELSIZE == 3 ; --------------- + + ; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **) + ; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **) + ; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **) + ; xmmG=(** ** ** ** ** ** ** ** **), xmmH=(** ** ** ** ** ** ** ** **) + + punpcklbw xmmA,xmmC ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E) + punpcklbw xmmE,xmmB ; xmmE=(20 01 22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F) + punpcklbw xmmD,xmmF ; xmmD=(11 21 13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F) + + movdqa xmmG,xmmA + movdqa xmmH,xmmA + punpcklwd xmmA,xmmE ; xmmA=(00 10 20 01 02 12 22 03 04 14 24 05 06 16 26 07) + punpckhwd xmmG,xmmE ; xmmG=(08 18 28 09 0A 1A 2A 0B 0C 1C 2C 0D 0E 1E 2E 0F) + + psrldq xmmH,2 ; xmmH=(02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E -- --) + psrldq xmmE,2 ; xmmE=(22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F -- --) + + movdqa xmmC,xmmD + movdqa xmmB,xmmD + punpcklwd xmmD,xmmH ; xmmD=(11 21 02 12 13 23 04 14 15 25 06 16 17 27 08 18) + punpckhwd xmmC,xmmH ; xmmC=(19 29 0A 1A 1B 2B 0C 1C 1D 2D 0E 1E 1F 2F -- --) + + psrldq xmmB,2 ; xmmB=(13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F -- --) + + movdqa xmmF,xmmE + punpcklwd xmmE,xmmB ; xmmE=(22 03 13 23 24 05 15 25 26 07 17 27 28 09 19 29) + punpckhwd xmmF,xmmB ; xmmF=(2A 0B 1B 2B 2C 0D 1D 2D 2E 0F 1F 2F -- -- -- --) + + pshufd xmmH,xmmA,0x4E; xmmH=(04 14 24 05 06 16 26 07 00 10 20 01 02 12 22 03) + movdqa xmmB,xmmE + punpckldq xmmA,xmmD ; xmmA=(00 10 20 01 11 21 02 12 02 12 22 03 13 23 04 14) + punpckldq xmmE,xmmH ; xmmE=(22 03 13 23 04 14 24 05 24 05 15 25 06 16 26 07) + punpckhdq xmmD,xmmB ; xmmD=(15 25 06 16 26 07 17 27 17 27 08 18 28 09 19 29) + + pshufd xmmH,xmmG,0x4E; xmmH=(0C 1C 2C 0D 0E 1E 2E 0F 08 18 28 09 0A 1A 2A 0B) + movdqa xmmB,xmmF + punpckldq xmmG,xmmC ; xmmG=(08 18 28 09 19 29 0A 1A 0A 1A 2A 0B 1B 2B 0C 1C) + punpckldq xmmF,xmmH ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 2C 0D 1D 2D 0E 1E 2E 0F) + punpckhdq xmmC,xmmB ; xmmC=(1D 2D 0E 1E 2E 0F 1F 2F 1F 2F -- -- -- -- -- --) + + punpcklqdq xmmA,xmmE ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05) + punpcklqdq xmmD,xmmG ; xmmD=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A) + punpcklqdq xmmF,xmmC ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F) + + cmp rcx, byte SIZEOF_XMMWORD + jb short .column_st32 + + test rdi, SIZEOF_XMMWORD-1 + jnz short .out1 + ; --(aligned)------------------- + movntdq XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA + movntdq XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD + movntdq XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmF + jmp short .out0 +.out1: ; --(unaligned)----------------- + movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA + movdqu XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD + movdqu XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmF +.out0: + add rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr + sub rcx, byte SIZEOF_XMMWORD + jz near .endcolumn + + add rsi, byte SIZEOF_XMMWORD ; inptr0 + dec al ; Yctr + jnz near .Yloop_2nd + + add rbx, byte SIZEOF_XMMWORD ; inptr1 + add rdx, byte SIZEOF_XMMWORD ; inptr2 + jmp near .columnloop + +.column_st32: + lea rcx, [rcx+rcx*2] ; imul ecx, RGB_PIXELSIZE + cmp rcx, byte 2*SIZEOF_XMMWORD + jb short .column_st16 + movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA + movdqu XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD + add rdi, byte 2*SIZEOF_XMMWORD ; outptr + movdqa xmmA,xmmF + sub rcx, byte 2*SIZEOF_XMMWORD + jmp short .column_st15 +.column_st16: + cmp rcx, byte SIZEOF_XMMWORD + jb short .column_st15 + movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA + add rdi, byte SIZEOF_XMMWORD ; outptr + movdqa xmmA,xmmD + sub rcx, byte SIZEOF_XMMWORD +.column_st15: + ; Store the lower 8 bytes of xmmA to the output when it has enough + ; space. + cmp rcx, byte SIZEOF_MMWORD + jb short .column_st7 + movq XMM_MMWORD [rdi], xmmA + add rdi, byte SIZEOF_MMWORD + sub rcx, byte SIZEOF_MMWORD + psrldq xmmA, SIZEOF_MMWORD +.column_st7: + ; Store the lower 4 bytes of xmmA to the output when it has enough + ; space. + cmp rcx, byte SIZEOF_DWORD + jb short .column_st3 + movd XMM_DWORD [rdi], xmmA + add rdi, byte SIZEOF_DWORD + sub rcx, byte SIZEOF_DWORD + psrldq xmmA, SIZEOF_DWORD +.column_st3: + ; Store the lower 2 bytes of rax to the output when it has enough + ; space. + movd eax, xmmA + cmp rcx, byte SIZEOF_WORD + jb short .column_st1 + mov WORD [rdi], ax + add rdi, byte SIZEOF_WORD + sub rcx, byte SIZEOF_WORD + shr rax, 16 +.column_st1: + ; Store the lower 1 byte of rax to the output when it has enough + ; space. + test rcx, rcx + jz short .endcolumn + mov BYTE [rdi], al + +%else ; RGB_PIXELSIZE == 4 ; ----------- + +%ifdef RGBX_FILLER_0XFF + pcmpeqb xmm6,xmm6 ; xmm6=XE=X(02468ACE********) + pcmpeqb xmm7,xmm7 ; xmm7=XO=X(13579BDF********) +%else + pxor xmm6,xmm6 ; xmm6=XE=X(02468ACE********) + pxor xmm7,xmm7 ; xmm7=XO=X(13579BDF********) +%endif + ; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **) + ; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **) + ; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **) + ; xmmG=(30 32 34 36 38 3A 3C 3E **), xmmH=(31 33 35 37 39 3B 3D 3F **) + + punpcklbw xmmA,xmmC ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E) + punpcklbw xmmE,xmmG ; xmmE=(20 30 22 32 24 34 26 36 28 38 2A 3A 2C 3C 2E 3E) + punpcklbw xmmB,xmmD ; xmmB=(01 11 03 13 05 15 07 17 09 19 0B 1B 0D 1D 0F 1F) + punpcklbw xmmF,xmmH ; xmmF=(21 31 23 33 25 35 27 37 29 39 2B 3B 2D 3D 2F 3F) + + movdqa xmmC,xmmA + punpcklwd xmmA,xmmE ; xmmA=(00 10 20 30 02 12 22 32 04 14 24 34 06 16 26 36) + punpckhwd xmmC,xmmE ; xmmC=(08 18 28 38 0A 1A 2A 3A 0C 1C 2C 3C 0E 1E 2E 3E) + movdqa xmmG,xmmB + punpcklwd xmmB,xmmF ; xmmB=(01 11 21 31 03 13 23 33 05 15 25 35 07 17 27 37) + punpckhwd xmmG,xmmF ; xmmG=(09 19 29 39 0B 1B 2B 3B 0D 1D 2D 3D 0F 1F 2F 3F) + + movdqa xmmD,xmmA + punpckldq xmmA,xmmB ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33) + punpckhdq xmmD,xmmB ; xmmD=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37) + movdqa xmmH,xmmC + punpckldq xmmC,xmmG ; xmmC=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B) + punpckhdq xmmH,xmmG ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F) + + cmp rcx, byte SIZEOF_XMMWORD + jb short .column_st32 + + test rdi, SIZEOF_XMMWORD-1 + jnz short .out1 + ; --(aligned)------------------- + movntdq XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA + movntdq XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD + movntdq XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmC + movntdq XMMWORD [rdi+3*SIZEOF_XMMWORD], xmmH + jmp short .out0 +.out1: ; --(unaligned)----------------- + movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA + movdqu XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD + movdqu XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmC + movdqu XMMWORD [rdi+3*SIZEOF_XMMWORD], xmmH +.out0: + add rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr + sub rcx, byte SIZEOF_XMMWORD + jz near .endcolumn + + add rsi, byte SIZEOF_XMMWORD ; inptr0 + dec al ; Yctr + jnz near .Yloop_2nd + + add rbx, byte SIZEOF_XMMWORD ; inptr1 + add rdx, byte SIZEOF_XMMWORD ; inptr2 + jmp near .columnloop + +.column_st32: + cmp rcx, byte SIZEOF_XMMWORD/2 + jb short .column_st16 + movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA + movdqu XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD + add rdi, byte 2*SIZEOF_XMMWORD ; outptr + movdqa xmmA,xmmC + movdqa xmmD,xmmH + sub rcx, byte SIZEOF_XMMWORD/2 +.column_st16: + cmp rcx, byte SIZEOF_XMMWORD/4 + jb short .column_st15 + movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA + add rdi, byte SIZEOF_XMMWORD ; outptr + movdqa xmmA,xmmD + sub rcx, byte SIZEOF_XMMWORD/4 +.column_st15: + ; Store two pixels (8 bytes) of xmmA to the output when it has enough + ; space. + cmp rcx, byte SIZEOF_XMMWORD/8 + jb short .column_st7 + movq XMM_MMWORD [rdi], xmmA + add rdi, byte SIZEOF_XMMWORD/8*4 + sub rcx, byte SIZEOF_XMMWORD/8 + psrldq xmmA, SIZEOF_XMMWORD/8*4 +.column_st7: + ; Store one pixel (4 bytes) of xmmA to the output when it has enough + ; space. + test rcx, rcx + jz short .endcolumn + movd XMM_DWORD [rdi], xmmA + +%endif ; RGB_PIXELSIZE ; --------------- + +.endcolumn: + sfence ; flush the write buffer + +.return: + pop rbx + uncollect_args + mov rsp,rbp ; rsp <- aligned rbp + pop rsp ; rsp <- original rbp + pop rbp + ret + +; -------------------------------------------------------------------------- +; +; Upsample and color convert for the case of 2:1 horizontal and 2:1 vertical. +; +; GLOBAL(void) +; jsimd_h2v2_merged_upsample_sse2 (JDIMENSION output_width, +; JSAMPIMAGE input_buf, +; JDIMENSION in_row_group_ctr, +; JSAMPARRAY output_buf); +; + +; r10 = JDIMENSION output_width +; r11 = JSAMPIMAGE input_buf +; r12 = JDIMENSION in_row_group_ctr +; r13 = JSAMPARRAY output_buf + + align 16 + global EXTN(jsimd_h2v2_merged_upsample_sse2) + +EXTN(jsimd_h2v2_merged_upsample_sse2): + push rbp + mov rax,rsp + mov rbp,rsp + collect_args + push rbx + + mov eax, r10d + + mov rdi, r11 + mov ecx, r12d + mov rsi, JSAMPARRAY [rdi+0*SIZEOF_JSAMPARRAY] + mov rbx, JSAMPARRAY [rdi+1*SIZEOF_JSAMPARRAY] + mov rdx, JSAMPARRAY [rdi+2*SIZEOF_JSAMPARRAY] + mov rdi, r13 + lea rsi, [rsi+rcx*SIZEOF_JSAMPROW] + + push rdx ; inptr2 + push rbx ; inptr1 + push rsi ; inptr00 + mov rbx,rsp + + push rdi + push rcx + push rax + + %ifdef WIN64 + mov r8, rcx + mov r9, rdi + mov rcx, rax + mov rdx, rbx + %else + mov rdx, rcx + mov rcx, rdi + mov rdi, rax + mov rsi, rbx + %endif + + call EXTN(jsimd_h2v1_merged_upsample_sse2) + + pop rax + pop rcx + pop rdi + pop rsi + pop rbx + pop rdx + + add rdi, byte SIZEOF_JSAMPROW ; outptr1 + add rsi, byte SIZEOF_JSAMPROW ; inptr01 + + push rdx ; inptr2 + push rbx ; inptr1 + push rsi ; inptr00 + mov rbx,rsp + + push rdi + push rcx + push rax + + %ifdef WIN64 + mov r8, rcx + mov r9, rdi + mov rcx, rax + mov rdx, rbx + %else + mov rdx, rcx + mov rcx, rdi + mov rdi, rax + mov rsi, rbx + %endif + + call EXTN(jsimd_h2v1_merged_upsample_sse2) + + pop rax + pop rcx + pop rdi + pop rsi + pop rbx + pop rdx + + pop rbx + uncollect_args + pop rbp + ret + +; For some reason, the OS X linker does not honor the request to align the +; segment unless we do this. + align 16 diff --git a/media/libjpeg/simd/jdmrgext-sse2.asm b/media/libjpeg/simd/jdmrgext-sse2.asm new file mode 100644 index 000000000..b50f698b4 --- /dev/null +++ b/media/libjpeg/simd/jdmrgext-sse2.asm @@ -0,0 +1,518 @@ +; +; jdmrgext.asm - merged upsampling/color conversion (SSE2) +; +; Copyright 2009, 2012 Pierre Ossman <ossman@cendio.se> for Cendio AB +; Copyright (C) 2012, D. R. Commander. +; +; Based on the x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler). +; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 +; +; [TAB8] + +%include "jcolsamp.inc" + +; -------------------------------------------------------------------------- +; +; Upsample and color convert for the case of 2:1 horizontal and 1:1 vertical. +; +; GLOBAL(void) +; jsimd_h2v1_merged_upsample_sse2 (JDIMENSION output_width, +; JSAMPIMAGE input_buf, +; JDIMENSION in_row_group_ctr, +; JSAMPARRAY output_buf); +; + +%define output_width(b) (b)+8 ; JDIMENSION output_width +%define input_buf(b) (b)+12 ; JSAMPIMAGE input_buf +%define in_row_group_ctr(b) (b)+16 ; JDIMENSION in_row_group_ctr +%define output_buf(b) (b)+20 ; JSAMPARRAY output_buf + +%define original_ebp ebp+0 +%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM] +%define WK_NUM 3 +%define gotptr wk(0)-SIZEOF_POINTER ; void * gotptr + + align 16 + global EXTN(jsimd_h2v1_merged_upsample_sse2) + +EXTN(jsimd_h2v1_merged_upsample_sse2): + push ebp + mov eax,esp ; eax = original ebp + sub esp, byte 4 + and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits + mov [esp],eax + mov ebp,esp ; ebp = aligned ebp + lea esp, [wk(0)] + pushpic eax ; make a room for GOT address + push ebx +; push ecx ; need not be preserved +; push edx ; need not be preserved + push esi + push edi + + get_GOT ebx ; get GOT address + movpic POINTER [gotptr], ebx ; save GOT address + + mov ecx, JDIMENSION [output_width(eax)] ; col + test ecx,ecx + jz near .return + + push ecx + + mov edi, JSAMPIMAGE [input_buf(eax)] + mov ecx, JDIMENSION [in_row_group_ctr(eax)] + mov esi, JSAMPARRAY [edi+0*SIZEOF_JSAMPARRAY] + mov ebx, JSAMPARRAY [edi+1*SIZEOF_JSAMPARRAY] + mov edx, JSAMPARRAY [edi+2*SIZEOF_JSAMPARRAY] + mov edi, JSAMPARRAY [output_buf(eax)] + mov esi, JSAMPROW [esi+ecx*SIZEOF_JSAMPROW] ; inptr0 + mov ebx, JSAMPROW [ebx+ecx*SIZEOF_JSAMPROW] ; inptr1 + mov edx, JSAMPROW [edx+ecx*SIZEOF_JSAMPROW] ; inptr2 + mov edi, JSAMPROW [edi] ; outptr + + pop ecx ; col + + alignx 16,7 +.columnloop: + movpic eax, POINTER [gotptr] ; load GOT address (eax) + + movdqa xmm6, XMMWORD [ebx] ; xmm6=Cb(0123456789ABCDEF) + movdqa xmm7, XMMWORD [edx] ; xmm7=Cr(0123456789ABCDEF) + + pxor xmm1,xmm1 ; xmm1=(all 0's) + pcmpeqw xmm3,xmm3 + psllw xmm3,7 ; xmm3={0xFF80 0xFF80 0xFF80 0xFF80 ..} + + movdqa xmm4,xmm6 + punpckhbw xmm6,xmm1 ; xmm6=Cb(89ABCDEF)=CbH + punpcklbw xmm4,xmm1 ; xmm4=Cb(01234567)=CbL + movdqa xmm0,xmm7 + punpckhbw xmm7,xmm1 ; xmm7=Cr(89ABCDEF)=CrH + punpcklbw xmm0,xmm1 ; xmm0=Cr(01234567)=CrL + + paddw xmm6,xmm3 + paddw xmm4,xmm3 + paddw xmm7,xmm3 + paddw xmm0,xmm3 + + ; (Original) + ; R = Y + 1.40200 * Cr + ; G = Y - 0.34414 * Cb - 0.71414 * Cr + ; B = Y + 1.77200 * Cb + ; + ; (This implementation) + ; R = Y + 0.40200 * Cr + Cr + ; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr + ; B = Y - 0.22800 * Cb + Cb + Cb + + movdqa xmm5,xmm6 ; xmm5=CbH + movdqa xmm2,xmm4 ; xmm2=CbL + paddw xmm6,xmm6 ; xmm6=2*CbH + paddw xmm4,xmm4 ; xmm4=2*CbL + movdqa xmm1,xmm7 ; xmm1=CrH + movdqa xmm3,xmm0 ; xmm3=CrL + paddw xmm7,xmm7 ; xmm7=2*CrH + paddw xmm0,xmm0 ; xmm0=2*CrL + + pmulhw xmm6,[GOTOFF(eax,PW_MF0228)] ; xmm6=(2*CbH * -FIX(0.22800)) + pmulhw xmm4,[GOTOFF(eax,PW_MF0228)] ; xmm4=(2*CbL * -FIX(0.22800)) + pmulhw xmm7,[GOTOFF(eax,PW_F0402)] ; xmm7=(2*CrH * FIX(0.40200)) + pmulhw xmm0,[GOTOFF(eax,PW_F0402)] ; xmm0=(2*CrL * FIX(0.40200)) + + paddw xmm6,[GOTOFF(eax,PW_ONE)] + paddw xmm4,[GOTOFF(eax,PW_ONE)] + psraw xmm6,1 ; xmm6=(CbH * -FIX(0.22800)) + psraw xmm4,1 ; xmm4=(CbL * -FIX(0.22800)) + paddw xmm7,[GOTOFF(eax,PW_ONE)] + paddw xmm0,[GOTOFF(eax,PW_ONE)] + psraw xmm7,1 ; xmm7=(CrH * FIX(0.40200)) + psraw xmm0,1 ; xmm0=(CrL * FIX(0.40200)) + + paddw xmm6,xmm5 + paddw xmm4,xmm2 + paddw xmm6,xmm5 ; xmm6=(CbH * FIX(1.77200))=(B-Y)H + paddw xmm4,xmm2 ; xmm4=(CbL * FIX(1.77200))=(B-Y)L + paddw xmm7,xmm1 ; xmm7=(CrH * FIX(1.40200))=(R-Y)H + paddw xmm0,xmm3 ; xmm0=(CrL * FIX(1.40200))=(R-Y)L + + movdqa XMMWORD [wk(0)], xmm6 ; wk(0)=(B-Y)H + movdqa XMMWORD [wk(1)], xmm7 ; wk(1)=(R-Y)H + + movdqa xmm6,xmm5 + movdqa xmm7,xmm2 + punpcklwd xmm5,xmm1 + punpckhwd xmm6,xmm1 + pmaddwd xmm5,[GOTOFF(eax,PW_MF0344_F0285)] + pmaddwd xmm6,[GOTOFF(eax,PW_MF0344_F0285)] + punpcklwd xmm2,xmm3 + punpckhwd xmm7,xmm3 + pmaddwd xmm2,[GOTOFF(eax,PW_MF0344_F0285)] + pmaddwd xmm7,[GOTOFF(eax,PW_MF0344_F0285)] + + paddd xmm5,[GOTOFF(eax,PD_ONEHALF)] + paddd xmm6,[GOTOFF(eax,PD_ONEHALF)] + psrad xmm5,SCALEBITS + psrad xmm6,SCALEBITS + paddd xmm2,[GOTOFF(eax,PD_ONEHALF)] + paddd xmm7,[GOTOFF(eax,PD_ONEHALF)] + psrad xmm2,SCALEBITS + psrad xmm7,SCALEBITS + + packssdw xmm5,xmm6 ; xmm5=CbH*-FIX(0.344)+CrH*FIX(0.285) + packssdw xmm2,xmm7 ; xmm2=CbL*-FIX(0.344)+CrL*FIX(0.285) + psubw xmm5,xmm1 ; xmm5=CbH*-FIX(0.344)+CrH*-FIX(0.714)=(G-Y)H + psubw xmm2,xmm3 ; xmm2=CbL*-FIX(0.344)+CrL*-FIX(0.714)=(G-Y)L + + movdqa XMMWORD [wk(2)], xmm5 ; wk(2)=(G-Y)H + + mov al,2 ; Yctr + jmp short .Yloop_1st + alignx 16,7 + +.Yloop_2nd: + movdqa xmm0, XMMWORD [wk(1)] ; xmm0=(R-Y)H + movdqa xmm2, XMMWORD [wk(2)] ; xmm2=(G-Y)H + movdqa xmm4, XMMWORD [wk(0)] ; xmm4=(B-Y)H + alignx 16,7 + +.Yloop_1st: + movdqa xmm7, XMMWORD [esi] ; xmm7=Y(0123456789ABCDEF) + + pcmpeqw xmm6,xmm6 + psrlw xmm6,BYTE_BIT ; xmm6={0xFF 0x00 0xFF 0x00 ..} + pand xmm6,xmm7 ; xmm6=Y(02468ACE)=YE + psrlw xmm7,BYTE_BIT ; xmm7=Y(13579BDF)=YO + + movdqa xmm1,xmm0 ; xmm1=xmm0=(R-Y)(L/H) + movdqa xmm3,xmm2 ; xmm3=xmm2=(G-Y)(L/H) + movdqa xmm5,xmm4 ; xmm5=xmm4=(B-Y)(L/H) + + paddw xmm0,xmm6 ; xmm0=((R-Y)+YE)=RE=R(02468ACE) + paddw xmm1,xmm7 ; xmm1=((R-Y)+YO)=RO=R(13579BDF) + packuswb xmm0,xmm0 ; xmm0=R(02468ACE********) + packuswb xmm1,xmm1 ; xmm1=R(13579BDF********) + + paddw xmm2,xmm6 ; xmm2=((G-Y)+YE)=GE=G(02468ACE) + paddw xmm3,xmm7 ; xmm3=((G-Y)+YO)=GO=G(13579BDF) + packuswb xmm2,xmm2 ; xmm2=G(02468ACE********) + packuswb xmm3,xmm3 ; xmm3=G(13579BDF********) + + paddw xmm4,xmm6 ; xmm4=((B-Y)+YE)=BE=B(02468ACE) + paddw xmm5,xmm7 ; xmm5=((B-Y)+YO)=BO=B(13579BDF) + packuswb xmm4,xmm4 ; xmm4=B(02468ACE********) + packuswb xmm5,xmm5 ; xmm5=B(13579BDF********) + +%if RGB_PIXELSIZE == 3 ; --------------- + + ; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **) + ; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **) + ; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **) + ; xmmG=(** ** ** ** ** ** ** ** **), xmmH=(** ** ** ** ** ** ** ** **) + + punpcklbw xmmA,xmmC ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E) + punpcklbw xmmE,xmmB ; xmmE=(20 01 22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F) + punpcklbw xmmD,xmmF ; xmmD=(11 21 13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F) + + movdqa xmmG,xmmA + movdqa xmmH,xmmA + punpcklwd xmmA,xmmE ; xmmA=(00 10 20 01 02 12 22 03 04 14 24 05 06 16 26 07) + punpckhwd xmmG,xmmE ; xmmG=(08 18 28 09 0A 1A 2A 0B 0C 1C 2C 0D 0E 1E 2E 0F) + + psrldq xmmH,2 ; xmmH=(02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E -- --) + psrldq xmmE,2 ; xmmE=(22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F -- --) + + movdqa xmmC,xmmD + movdqa xmmB,xmmD + punpcklwd xmmD,xmmH ; xmmD=(11 21 02 12 13 23 04 14 15 25 06 16 17 27 08 18) + punpckhwd xmmC,xmmH ; xmmC=(19 29 0A 1A 1B 2B 0C 1C 1D 2D 0E 1E 1F 2F -- --) + + psrldq xmmB,2 ; xmmB=(13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F -- --) + + movdqa xmmF,xmmE + punpcklwd xmmE,xmmB ; xmmE=(22 03 13 23 24 05 15 25 26 07 17 27 28 09 19 29) + punpckhwd xmmF,xmmB ; xmmF=(2A 0B 1B 2B 2C 0D 1D 2D 2E 0F 1F 2F -- -- -- --) + + pshufd xmmH,xmmA,0x4E; xmmH=(04 14 24 05 06 16 26 07 00 10 20 01 02 12 22 03) + movdqa xmmB,xmmE + punpckldq xmmA,xmmD ; xmmA=(00 10 20 01 11 21 02 12 02 12 22 03 13 23 04 14) + punpckldq xmmE,xmmH ; xmmE=(22 03 13 23 04 14 24 05 24 05 15 25 06 16 26 07) + punpckhdq xmmD,xmmB ; xmmD=(15 25 06 16 26 07 17 27 17 27 08 18 28 09 19 29) + + pshufd xmmH,xmmG,0x4E; xmmH=(0C 1C 2C 0D 0E 1E 2E 0F 08 18 28 09 0A 1A 2A 0B) + movdqa xmmB,xmmF + punpckldq xmmG,xmmC ; xmmG=(08 18 28 09 19 29 0A 1A 0A 1A 2A 0B 1B 2B 0C 1C) + punpckldq xmmF,xmmH ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 2C 0D 1D 2D 0E 1E 2E 0F) + punpckhdq xmmC,xmmB ; xmmC=(1D 2D 0E 1E 2E 0F 1F 2F 1F 2F -- -- -- -- -- --) + + punpcklqdq xmmA,xmmE ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05) + punpcklqdq xmmD,xmmG ; xmmD=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A) + punpcklqdq xmmF,xmmC ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F) + + cmp ecx, byte SIZEOF_XMMWORD + jb short .column_st32 + + test edi, SIZEOF_XMMWORD-1 + jnz short .out1 + ; --(aligned)------------------- + movntdq XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA + movntdq XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD + movntdq XMMWORD [edi+2*SIZEOF_XMMWORD], xmmF + jmp short .out0 +.out1: ; --(unaligned)----------------- + movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA + movdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD + movdqu XMMWORD [edi+2*SIZEOF_XMMWORD], xmmF +.out0: + add edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr + sub ecx, byte SIZEOF_XMMWORD + jz near .endcolumn + + add esi, byte SIZEOF_XMMWORD ; inptr0 + dec al ; Yctr + jnz near .Yloop_2nd + + add ebx, byte SIZEOF_XMMWORD ; inptr1 + add edx, byte SIZEOF_XMMWORD ; inptr2 + jmp near .columnloop + alignx 16,7 + +.column_st32: + lea ecx, [ecx+ecx*2] ; imul ecx, RGB_PIXELSIZE + cmp ecx, byte 2*SIZEOF_XMMWORD + jb short .column_st16 + movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA + movdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD + add edi, byte 2*SIZEOF_XMMWORD ; outptr + movdqa xmmA,xmmF + sub ecx, byte 2*SIZEOF_XMMWORD + jmp short .column_st15 +.column_st16: + cmp ecx, byte SIZEOF_XMMWORD + jb short .column_st15 + movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA + add edi, byte SIZEOF_XMMWORD ; outptr + movdqa xmmA,xmmD + sub ecx, byte SIZEOF_XMMWORD +.column_st15: + ; Store the lower 8 bytes of xmmA to the output when it has enough + ; space. + cmp ecx, byte SIZEOF_MMWORD + jb short .column_st7 + movq XMM_MMWORD [edi], xmmA + add edi, byte SIZEOF_MMWORD + sub ecx, byte SIZEOF_MMWORD + psrldq xmmA, SIZEOF_MMWORD +.column_st7: + ; Store the lower 4 bytes of xmmA to the output when it has enough + ; space. + cmp ecx, byte SIZEOF_DWORD + jb short .column_st3 + movd XMM_DWORD [edi], xmmA + add edi, byte SIZEOF_DWORD + sub ecx, byte SIZEOF_DWORD + psrldq xmmA, SIZEOF_DWORD +.column_st3: + ; Store the lower 2 bytes of eax to the output when it has enough + ; space. + movd eax, xmmA + cmp ecx, byte SIZEOF_WORD + jb short .column_st1 + mov WORD [edi], ax + add edi, byte SIZEOF_WORD + sub ecx, byte SIZEOF_WORD + shr eax, 16 +.column_st1: + ; Store the lower 1 byte of eax to the output when it has enough + ; space. + test ecx, ecx + jz short .endcolumn + mov BYTE [edi], al + +%else ; RGB_PIXELSIZE == 4 ; ----------- + +%ifdef RGBX_FILLER_0XFF + pcmpeqb xmm6,xmm6 ; xmm6=XE=X(02468ACE********) + pcmpeqb xmm7,xmm7 ; xmm7=XO=X(13579BDF********) +%else + pxor xmm6,xmm6 ; xmm6=XE=X(02468ACE********) + pxor xmm7,xmm7 ; xmm7=XO=X(13579BDF********) +%endif + ; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **) + ; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **) + ; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **) + ; xmmG=(30 32 34 36 38 3A 3C 3E **), xmmH=(31 33 35 37 39 3B 3D 3F **) + + punpcklbw xmmA,xmmC ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E) + punpcklbw xmmE,xmmG ; xmmE=(20 30 22 32 24 34 26 36 28 38 2A 3A 2C 3C 2E 3E) + punpcklbw xmmB,xmmD ; xmmB=(01 11 03 13 05 15 07 17 09 19 0B 1B 0D 1D 0F 1F) + punpcklbw xmmF,xmmH ; xmmF=(21 31 23 33 25 35 27 37 29 39 2B 3B 2D 3D 2F 3F) + + movdqa xmmC,xmmA + punpcklwd xmmA,xmmE ; xmmA=(00 10 20 30 02 12 22 32 04 14 24 34 06 16 26 36) + punpckhwd xmmC,xmmE ; xmmC=(08 18 28 38 0A 1A 2A 3A 0C 1C 2C 3C 0E 1E 2E 3E) + movdqa xmmG,xmmB + punpcklwd xmmB,xmmF ; xmmB=(01 11 21 31 03 13 23 33 05 15 25 35 07 17 27 37) + punpckhwd xmmG,xmmF ; xmmG=(09 19 29 39 0B 1B 2B 3B 0D 1D 2D 3D 0F 1F 2F 3F) + + movdqa xmmD,xmmA + punpckldq xmmA,xmmB ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33) + punpckhdq xmmD,xmmB ; xmmD=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37) + movdqa xmmH,xmmC + punpckldq xmmC,xmmG ; xmmC=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B) + punpckhdq xmmH,xmmG ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F) + + cmp ecx, byte SIZEOF_XMMWORD + jb short .column_st32 + + test edi, SIZEOF_XMMWORD-1 + jnz short .out1 + ; --(aligned)------------------- + movntdq XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA + movntdq XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD + movntdq XMMWORD [edi+2*SIZEOF_XMMWORD], xmmC + movntdq XMMWORD [edi+3*SIZEOF_XMMWORD], xmmH + jmp short .out0 +.out1: ; --(unaligned)----------------- + movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA + movdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD + movdqu XMMWORD [edi+2*SIZEOF_XMMWORD], xmmC + movdqu XMMWORD [edi+3*SIZEOF_XMMWORD], xmmH +.out0: + add edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr + sub ecx, byte SIZEOF_XMMWORD + jz near .endcolumn + + add esi, byte SIZEOF_XMMWORD ; inptr0 + dec al ; Yctr + jnz near .Yloop_2nd + + add ebx, byte SIZEOF_XMMWORD ; inptr1 + add edx, byte SIZEOF_XMMWORD ; inptr2 + jmp near .columnloop + alignx 16,7 + +.column_st32: + cmp ecx, byte SIZEOF_XMMWORD/2 + jb short .column_st16 + movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA + movdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD + add edi, byte 2*SIZEOF_XMMWORD ; outptr + movdqa xmmA,xmmC + movdqa xmmD,xmmH + sub ecx, byte SIZEOF_XMMWORD/2 +.column_st16: + cmp ecx, byte SIZEOF_XMMWORD/4 + jb short .column_st15 + movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA + add edi, byte SIZEOF_XMMWORD ; outptr + movdqa xmmA,xmmD + sub ecx, byte SIZEOF_XMMWORD/4 +.column_st15: + ; Store two pixels (8 bytes) of xmmA to the output when it has enough + ; space. + cmp ecx, byte SIZEOF_XMMWORD/8 + jb short .column_st7 + movq XMM_MMWORD [edi], xmmA + add edi, byte SIZEOF_XMMWORD/8*4 + sub ecx, byte SIZEOF_XMMWORD/8 + psrldq xmmA, SIZEOF_XMMWORD/8*4 +.column_st7: + ; Store one pixel (4 bytes) of xmmA to the output when it has enough + ; space. + test ecx, ecx + jz short .endcolumn + movd XMM_DWORD [edi], xmmA + +%endif ; RGB_PIXELSIZE ; --------------- + +.endcolumn: + sfence ; flush the write buffer + +.return: + pop edi + pop esi +; pop edx ; need not be preserved +; pop ecx ; need not be preserved + pop ebx + mov esp,ebp ; esp <- aligned ebp + pop esp ; esp <- original ebp + pop ebp + ret + +; -------------------------------------------------------------------------- +; +; Upsample and color convert for the case of 2:1 horizontal and 2:1 vertical. +; +; GLOBAL(void) +; jsimd_h2v2_merged_upsample_sse2 (JDIMENSION output_width, +; JSAMPIMAGE input_buf, +; JDIMENSION in_row_group_ctr, +; JSAMPARRAY output_buf); +; + +%define output_width(b) (b)+8 ; JDIMENSION output_width +%define input_buf(b) (b)+12 ; JSAMPIMAGE input_buf +%define in_row_group_ctr(b) (b)+16 ; JDIMENSION in_row_group_ctr +%define output_buf(b) (b)+20 ; JSAMPARRAY output_buf + + align 16 + global EXTN(jsimd_h2v2_merged_upsample_sse2) + +EXTN(jsimd_h2v2_merged_upsample_sse2): + push ebp + mov ebp,esp + push ebx +; push ecx ; need not be preserved +; push edx ; need not be preserved + push esi + push edi + + mov eax, POINTER [output_width(ebp)] + + mov edi, JSAMPIMAGE [input_buf(ebp)] + mov ecx, JDIMENSION [in_row_group_ctr(ebp)] + mov esi, JSAMPARRAY [edi+0*SIZEOF_JSAMPARRAY] + mov ebx, JSAMPARRAY [edi+1*SIZEOF_JSAMPARRAY] + mov edx, JSAMPARRAY [edi+2*SIZEOF_JSAMPARRAY] + mov edi, JSAMPARRAY [output_buf(ebp)] + lea esi, [esi+ecx*SIZEOF_JSAMPROW] + + push edx ; inptr2 + push ebx ; inptr1 + push esi ; inptr00 + mov ebx,esp + + push edi ; output_buf (outptr0) + push ecx ; in_row_group_ctr + push ebx ; input_buf + push eax ; output_width + + call near EXTN(jsimd_h2v1_merged_upsample_sse2) + + add esi, byte SIZEOF_JSAMPROW ; inptr01 + add edi, byte SIZEOF_JSAMPROW ; outptr1 + mov POINTER [ebx+0*SIZEOF_POINTER], esi + mov POINTER [ebx-1*SIZEOF_POINTER], edi + + call near EXTN(jsimd_h2v1_merged_upsample_sse2) + + add esp, byte 7*SIZEOF_DWORD + + pop edi + pop esi +; pop edx ; need not be preserved +; pop ecx ; need not be preserved + pop ebx + pop ebp + ret + +; For some reason, the OS X linker does not honor the request to align the +; segment unless we do this. + align 16 diff --git a/media/libjpeg/simd/jdsample-altivec.c b/media/libjpeg/simd/jdsample-altivec.c new file mode 100644 index 000000000..b40ce55c8 --- /dev/null +++ b/media/libjpeg/simd/jdsample-altivec.c @@ -0,0 +1,392 @@ +/* + * AltiVec optimizations for libjpeg-turbo + * + * Copyright (C) 2015, D. R. Commander. All Rights Reserved. + * + * This software is provided 'as-is', without any express or implied + * warranty. In no event will the authors be held liable for any damages + * arising from the use of this software. + * + * Permission is granted to anyone to use this software for any purpose, + * including commercial applications, and to alter it and redistribute it + * freely, subject to the following restrictions: + * + * 1. The origin of this software must not be misrepresented; you must not + * claim that you wrote the original software. If you use this software + * in a product, an acknowledgment in the product documentation would be + * appreciated but is not required. + * 2. Altered source versions must be plainly marked as such, and must not be + * misrepresented as being the original software. + * 3. This notice may not be removed or altered from any source distribution. + */ + +/* CHROMA UPSAMPLING */ + +#include "jsimd_altivec.h" + + +void +jsimd_h2v1_fancy_upsample_altivec (int max_v_samp_factor, + JDIMENSION downsampled_width, + JSAMPARRAY input_data, + JSAMPARRAY *output_data_ptr) +{ + JSAMPARRAY output_data = *output_data_ptr; + JSAMPROW inptr, outptr; + int inrow, incol; + + __vector unsigned char this0, last0, p_last0, next0 = {0}, p_next0, + out; + __vector short this0e, this0o, this0l, this0h, last0l, last0h, + next0l, next0h, outle, outhe, outlo, outho; + + /* Constants */ + __vector unsigned char pb_zero = { __16X(0) }, pb_three = { __16X(3) }, + last_index_col0 = {0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14}, + last_index = {15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30}, + next_index = {1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16}, + next_index_lastcol = {1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,15}, +#if __BIG_ENDIAN__ + merge_pack_index = {1,17,3,19,5,21,7,23,9,25,11,27,13,29,15,31}; +#else + merge_pack_index = {0,16,2,18,4,20,6,22,8,24,10,26,12,28,14,30}; +#endif + __vector short pw_one = { __8X(1) }, pw_two = { __8X(2) }; + + for (inrow = 0; inrow < max_v_samp_factor; inrow++) { + inptr = input_data[inrow]; + outptr = output_data[inrow]; + + if (downsampled_width & 15) + inptr[downsampled_width] = inptr[downsampled_width - 1]; + + this0 = vec_ld(0, inptr); + p_last0 = vec_perm(this0, this0, last_index_col0); + last0 = this0; + + for (incol = downsampled_width; incol > 0; + incol -= 16, inptr += 16, outptr += 32) { + + if (downsampled_width - incol > 0) { + p_last0 = vec_perm(last0, this0, last_index); + last0 = this0; + } + + if (incol <= 16) + p_next0 = vec_perm(this0, this0, next_index_lastcol); + else { + next0 = vec_ld(16, inptr); + p_next0 = vec_perm(this0, next0, next_index); + } + + this0e = (__vector short)vec_mule(this0, pb_three); + this0o = (__vector short)vec_mulo(this0, pb_three); + this0l = vec_mergeh(this0e, this0o); + this0h = vec_mergel(this0e, this0o); + + last0l = (__vector short)VEC_UNPACKHU(p_last0); + last0h = (__vector short)VEC_UNPACKLU(p_last0); + last0l = vec_add(last0l, pw_one); + + next0l = (__vector short)VEC_UNPACKHU(p_next0); + next0h = (__vector short)VEC_UNPACKLU(p_next0); + next0l = vec_add(next0l, pw_two); + + outle = vec_add(this0l, last0l); + outlo = vec_add(this0l, next0l); + outle = vec_sr(outle, (__vector unsigned short)pw_two); + outlo = vec_sr(outlo, (__vector unsigned short)pw_two); + + out = vec_perm((__vector unsigned char)outle, + (__vector unsigned char)outlo, merge_pack_index); + vec_st(out, 0, outptr); + + if (incol > 8) { + last0h = vec_add(last0h, pw_one); + next0h = vec_add(next0h, pw_two); + + outhe = vec_add(this0h, last0h); + outho = vec_add(this0h, next0h); + outhe = vec_sr(outhe, (__vector unsigned short)pw_two); + outho = vec_sr(outho, (__vector unsigned short)pw_two); + + out = vec_perm((__vector unsigned char)outhe, + (__vector unsigned char)outho, merge_pack_index); + vec_st(out, 16, outptr); + } + + this0 = next0; + } + } +} + + +void +jsimd_h2v2_fancy_upsample_altivec (int max_v_samp_factor, + JDIMENSION downsampled_width, + JSAMPARRAY input_data, + JSAMPARRAY *output_data_ptr) +{ + JSAMPARRAY output_data = *output_data_ptr; + JSAMPROW inptr_1, inptr0, inptr1, outptr0, outptr1; + int inrow, outrow, incol; + + __vector unsigned char this_1, this0, this1, out; + __vector short this_1l, this_1h, this0l, this0h, this1l, this1h, + lastcolsum_1h, lastcolsum1h, + p_lastcolsum_1l, p_lastcolsum_1h, p_lastcolsum1l, p_lastcolsum1h, + thiscolsum_1l, thiscolsum_1h, thiscolsum1l, thiscolsum1h, + nextcolsum_1l = {0}, nextcolsum_1h = {0}, + nextcolsum1l = {0}, nextcolsum1h = {0}, + p_nextcolsum_1l, p_nextcolsum_1h, p_nextcolsum1l, p_nextcolsum1h, + tmpl, tmph, outle, outhe, outlo, outho; + + /* Constants */ + __vector unsigned char pb_zero = { __16X(0) }, + last_index_col0 = {0,1,0,1,2,3,4,5,6,7,8,9,10,11,12,13}, + last_index={14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29}, + next_index = {2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17}, + next_index_lastcol = {2,3,4,5,6,7,8,9,10,11,12,13,14,15,14,15}, +#if __BIG_ENDIAN__ + merge_pack_index = {1,17,3,19,5,21,7,23,9,25,11,27,13,29,15,31}; +#else + merge_pack_index = {0,16,2,18,4,20,6,22,8,24,10,26,12,28,14,30}; +#endif + __vector short pw_zero = { __8X(0) }, pw_three = { __8X(3) }, + pw_seven = { __8X(7) }, pw_eight = { __8X(8) }; + __vector unsigned short pw_four = { __8X(4) }; + + for (inrow = 0, outrow = 0; outrow < max_v_samp_factor; inrow++) { + + inptr_1 = input_data[inrow - 1]; + inptr0 = input_data[inrow]; + inptr1 = input_data[inrow + 1]; + outptr0 = output_data[outrow++]; + outptr1 = output_data[outrow++]; + + if (downsampled_width & 15) { + inptr_1[downsampled_width] = inptr_1[downsampled_width - 1]; + inptr0[downsampled_width] = inptr0[downsampled_width - 1]; + inptr1[downsampled_width] = inptr1[downsampled_width - 1]; + } + + this0 = vec_ld(0, inptr0); + this0l = (__vector short)VEC_UNPACKHU(this0); + this0h = (__vector short)VEC_UNPACKLU(this0); + this0l = vec_mladd(this0l, pw_three, pw_zero); + this0h = vec_mladd(this0h, pw_three, pw_zero); + + this_1 = vec_ld(0, inptr_1); + this_1l = (__vector short)VEC_UNPACKHU(this_1); + this_1h = (__vector short)VEC_UNPACKLU(this_1); + thiscolsum_1l = vec_add(this0l, this_1l); + thiscolsum_1h = vec_add(this0h, this_1h); + lastcolsum_1h = thiscolsum_1h; + p_lastcolsum_1l = vec_perm(thiscolsum_1l, thiscolsum_1l, last_index_col0); + p_lastcolsum_1h = vec_perm(thiscolsum_1l, thiscolsum_1h, last_index); + + this1 = vec_ld(0, inptr1); + this1l = (__vector short)VEC_UNPACKHU(this1); + this1h = (__vector short)VEC_UNPACKLU(this1); + thiscolsum1l = vec_add(this0l, this1l); + thiscolsum1h = vec_add(this0h, this1h); + lastcolsum1h = thiscolsum1h; + p_lastcolsum1l = vec_perm(thiscolsum1l, thiscolsum1l, last_index_col0); + p_lastcolsum1h = vec_perm(thiscolsum1l, thiscolsum1h, last_index); + + for (incol = downsampled_width; incol > 0; + incol -= 16, inptr_1 += 16, inptr0 += 16, inptr1 += 16, + outptr0 += 32, outptr1 += 32) { + + if (downsampled_width - incol > 0) { + p_lastcolsum_1l = vec_perm(lastcolsum_1h, thiscolsum_1l, last_index); + p_lastcolsum_1h = vec_perm(thiscolsum_1l, thiscolsum_1h, last_index); + p_lastcolsum1l = vec_perm(lastcolsum1h, thiscolsum1l, last_index); + p_lastcolsum1h = vec_perm(thiscolsum1l, thiscolsum1h, last_index); + lastcolsum_1h = thiscolsum_1h; lastcolsum1h = thiscolsum1h; + } + + if (incol <= 16) { + p_nextcolsum_1l = vec_perm(thiscolsum_1l, thiscolsum_1h, next_index); + p_nextcolsum_1h = vec_perm(thiscolsum_1h, thiscolsum_1h, + next_index_lastcol); + p_nextcolsum1l = vec_perm(thiscolsum1l, thiscolsum1h, next_index); + p_nextcolsum1h = vec_perm(thiscolsum1h, thiscolsum1h, + next_index_lastcol); + } else { + this0 = vec_ld(16, inptr0); + this0l = (__vector short)VEC_UNPACKHU(this0); + this0h = (__vector short)VEC_UNPACKLU(this0); + this0l = vec_mladd(this0l, pw_three, pw_zero); + this0h = vec_mladd(this0h, pw_three, pw_zero); + + this_1 = vec_ld(16, inptr_1); + this_1l = (__vector short)VEC_UNPACKHU(this_1); + this_1h = (__vector short)VEC_UNPACKLU(this_1); + nextcolsum_1l = vec_add(this0l, this_1l); + nextcolsum_1h = vec_add(this0h, this_1h); + p_nextcolsum_1l = vec_perm(thiscolsum_1l, thiscolsum_1h, next_index); + p_nextcolsum_1h = vec_perm(thiscolsum_1h, nextcolsum_1l, next_index); + + this1 = vec_ld(16, inptr1); + this1l = (__vector short)VEC_UNPACKHU(this1); + this1h = (__vector short)VEC_UNPACKLU(this1); + nextcolsum1l = vec_add(this0l, this1l); + nextcolsum1h = vec_add(this0h, this1h); + p_nextcolsum1l = vec_perm(thiscolsum1l, thiscolsum1h, next_index); + p_nextcolsum1h = vec_perm(thiscolsum1h, nextcolsum1l, next_index); + } + + /* Process the upper row */ + + tmpl = vec_mladd(thiscolsum_1l, pw_three, pw_zero); + outle = vec_add(tmpl, p_lastcolsum_1l); + outle = vec_add(outle, pw_eight); + outle = vec_sr(outle, pw_four); + + outlo = vec_add(tmpl, p_nextcolsum_1l); + outlo = vec_add(outlo, pw_seven); + outlo = vec_sr(outlo, pw_four); + + out = vec_perm((__vector unsigned char)outle, + (__vector unsigned char)outlo, merge_pack_index); + vec_st(out, 0, outptr0); + + if (incol > 8) { + tmph = vec_mladd(thiscolsum_1h, pw_three, pw_zero); + outhe = vec_add(tmph, p_lastcolsum_1h); + outhe = vec_add(outhe, pw_eight); + outhe = vec_sr(outhe, pw_four); + + outho = vec_add(tmph, p_nextcolsum_1h); + outho = vec_add(outho, pw_seven); + outho = vec_sr(outho, pw_four); + + out = vec_perm((__vector unsigned char)outhe, + (__vector unsigned char)outho, merge_pack_index); + vec_st(out, 16, outptr0); + } + + /* Process the lower row */ + + tmpl = vec_mladd(thiscolsum1l, pw_three, pw_zero); + outle = vec_add(tmpl, p_lastcolsum1l); + outle = vec_add(outle, pw_eight); + outle = vec_sr(outle, pw_four); + + outlo = vec_add(tmpl, p_nextcolsum1l); + outlo = vec_add(outlo, pw_seven); + outlo = vec_sr(outlo, pw_four); + + out = vec_perm((__vector unsigned char)outle, + (__vector unsigned char)outlo, merge_pack_index); + vec_st(out, 0, outptr1); + + if (incol > 8) { + tmph = vec_mladd(thiscolsum1h, pw_three, pw_zero); + outhe = vec_add(tmph, p_lastcolsum1h); + outhe = vec_add(outhe, pw_eight); + outhe = vec_sr(outhe, pw_four); + + outho = vec_add(tmph, p_nextcolsum1h); + outho = vec_add(outho, pw_seven); + outho = vec_sr(outho, pw_four); + + out = vec_perm((__vector unsigned char)outhe, + (__vector unsigned char)outho, merge_pack_index); + vec_st(out, 16, outptr1); + } + + thiscolsum_1l = nextcolsum_1l; thiscolsum_1h = nextcolsum_1h; + thiscolsum1l = nextcolsum1l; thiscolsum1h = nextcolsum1h; + } + } +} + + +/* These are rarely used (mainly just for decompressing YCCK images) */ + +void +jsimd_h2v1_upsample_altivec (int max_v_samp_factor, + JDIMENSION output_width, + JSAMPARRAY input_data, + JSAMPARRAY *output_data_ptr) +{ + JSAMPARRAY output_data = *output_data_ptr; + JSAMPROW inptr, outptr; + int inrow, incol; + + __vector unsigned char in, inl, inh; + + for (inrow = 0; inrow < max_v_samp_factor; inrow++) { + inptr = input_data[inrow]; + outptr = output_data[inrow]; + + for (incol = (output_width + 31) & (~31); incol > 0; + incol -= 64, inptr += 32, outptr += 64) { + + in = vec_ld(0, inptr); + inl = vec_mergeh(in, in); + inh = vec_mergel(in, in); + + vec_st(inl, 0, outptr); + vec_st(inh, 16, outptr); + + if (incol > 32) { + in = vec_ld(16, inptr); + inl = vec_mergeh(in, in); + inh = vec_mergel(in, in); + + vec_st(inl, 32, outptr); + vec_st(inh, 48, outptr); + } + } + } +} + + +void +jsimd_h2v2_upsample_altivec (int max_v_samp_factor, + JDIMENSION output_width, + JSAMPARRAY input_data, + JSAMPARRAY *output_data_ptr) +{ + JSAMPARRAY output_data = *output_data_ptr; + JSAMPROW inptr, outptr0, outptr1; + int inrow, outrow, incol; + + __vector unsigned char in, inl, inh; + + for (inrow = 0, outrow = 0; outrow < max_v_samp_factor; inrow++) { + + inptr = input_data[inrow]; + outptr0 = output_data[outrow++]; + outptr1 = output_data[outrow++]; + + for (incol = (output_width + 31) & (~31); incol > 0; + incol -= 64, inptr += 32, outptr0 += 64, outptr1 += 64) { + + in = vec_ld(0, inptr); + inl = vec_mergeh(in, in); + inh = vec_mergel(in, in); + + vec_st(inl, 0, outptr0); + vec_st(inl, 0, outptr1); + + vec_st(inh, 16, outptr0); + vec_st(inh, 16, outptr1); + + if (incol > 32) { + in = vec_ld(16, inptr); + inl = vec_mergeh(in, in); + inh = vec_mergel(in, in); + + vec_st(inl, 32, outptr0); + vec_st(inl, 32, outptr1); + + vec_st(inh, 48, outptr0); + vec_st(inh, 48, outptr1); + } + } + } +} diff --git a/media/libjpeg/simd/jdsample-mmx.asm b/media/libjpeg/simd/jdsample-mmx.asm new file mode 100644 index 000000000..5e4fa7ae2 --- /dev/null +++ b/media/libjpeg/simd/jdsample-mmx.asm @@ -0,0 +1,736 @@ +; +; jdsample.asm - upsampling (MMX) +; +; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB +; +; Based on the x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler). +; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 +; +; [TAB8] + +%include "jsimdext.inc" + +; -------------------------------------------------------------------------- + SECTION SEG_CONST + + alignz 16 + global EXTN(jconst_fancy_upsample_mmx) + +EXTN(jconst_fancy_upsample_mmx): + +PW_ONE times 4 dw 1 +PW_TWO times 4 dw 2 +PW_THREE times 4 dw 3 +PW_SEVEN times 4 dw 7 +PW_EIGHT times 4 dw 8 + + alignz 16 + +; -------------------------------------------------------------------------- + SECTION SEG_TEXT + BITS 32 +; +; Fancy processing for the common case of 2:1 horizontal and 1:1 vertical. +; +; The upsampling algorithm is linear interpolation between pixel centers, +; also known as a "triangle filter". This is a good compromise between +; speed and visual quality. The centers of the output pixels are 1/4 and 3/4 +; of the way between input pixel centers. +; +; GLOBAL(void) +; jsimd_h2v1_fancy_upsample_mmx (int max_v_samp_factor, +; JDIMENSION downsampled_width, +; JSAMPARRAY input_data, +; JSAMPARRAY *output_data_ptr); +; + +%define max_v_samp(b) (b)+8 ; int max_v_samp_factor +%define downsamp_width(b) (b)+12 ; JDIMENSION downsampled_width +%define input_data(b) (b)+16 ; JSAMPARRAY input_data +%define output_data_ptr(b) (b)+20 ; JSAMPARRAY *output_data_ptr + + align 16 + global EXTN(jsimd_h2v1_fancy_upsample_mmx) + +EXTN(jsimd_h2v1_fancy_upsample_mmx): + push ebp + mov ebp,esp + pushpic ebx +; push ecx ; need not be preserved +; push edx ; need not be preserved + push esi + push edi + + get_GOT ebx ; get GOT address + + mov eax, JDIMENSION [downsamp_width(ebp)] ; colctr + test eax,eax + jz near .return + + mov ecx, INT [max_v_samp(ebp)] ; rowctr + test ecx,ecx + jz near .return + + mov esi, JSAMPARRAY [input_data(ebp)] ; input_data + mov edi, POINTER [output_data_ptr(ebp)] + mov edi, JSAMPARRAY [edi] ; output_data + alignx 16,7 +.rowloop: + push eax ; colctr + push edi + push esi + + mov esi, JSAMPROW [esi] ; inptr + mov edi, JSAMPROW [edi] ; outptr + + test eax, SIZEOF_MMWORD-1 + jz short .skip + mov dl, JSAMPLE [esi+(eax-1)*SIZEOF_JSAMPLE] + mov JSAMPLE [esi+eax*SIZEOF_JSAMPLE], dl ; insert a dummy sample +.skip: + pxor mm0,mm0 ; mm0=(all 0's) + pcmpeqb mm7,mm7 + psrlq mm7,(SIZEOF_MMWORD-1)*BYTE_BIT + pand mm7, MMWORD [esi+0*SIZEOF_MMWORD] + + add eax, byte SIZEOF_MMWORD-1 + and eax, byte -SIZEOF_MMWORD + cmp eax, byte SIZEOF_MMWORD + ja short .columnloop + alignx 16,7 + +.columnloop_last: + pcmpeqb mm6,mm6 + psllq mm6,(SIZEOF_MMWORD-1)*BYTE_BIT + pand mm6, MMWORD [esi+0*SIZEOF_MMWORD] + jmp short .upsample + alignx 16,7 + +.columnloop: + movq mm6, MMWORD [esi+1*SIZEOF_MMWORD] + psllq mm6,(SIZEOF_MMWORD-1)*BYTE_BIT + +.upsample: + movq mm1, MMWORD [esi+0*SIZEOF_MMWORD] + movq mm2,mm1 + movq mm3,mm1 ; mm1=( 0 1 2 3 4 5 6 7) + psllq mm2,BYTE_BIT ; mm2=( - 0 1 2 3 4 5 6) + psrlq mm3,BYTE_BIT ; mm3=( 1 2 3 4 5 6 7 -) + + por mm2,mm7 ; mm2=(-1 0 1 2 3 4 5 6) + por mm3,mm6 ; mm3=( 1 2 3 4 5 6 7 8) + + movq mm7,mm1 + psrlq mm7,(SIZEOF_MMWORD-1)*BYTE_BIT ; mm7=( 7 - - - - - - -) + + movq mm4,mm1 + punpcklbw mm1,mm0 ; mm1=( 0 1 2 3) + punpckhbw mm4,mm0 ; mm4=( 4 5 6 7) + movq mm5,mm2 + punpcklbw mm2,mm0 ; mm2=(-1 0 1 2) + punpckhbw mm5,mm0 ; mm5=( 3 4 5 6) + movq mm6,mm3 + punpcklbw mm3,mm0 ; mm3=( 1 2 3 4) + punpckhbw mm6,mm0 ; mm6=( 5 6 7 8) + + pmullw mm1,[GOTOFF(ebx,PW_THREE)] + pmullw mm4,[GOTOFF(ebx,PW_THREE)] + paddw mm2,[GOTOFF(ebx,PW_ONE)] + paddw mm5,[GOTOFF(ebx,PW_ONE)] + paddw mm3,[GOTOFF(ebx,PW_TWO)] + paddw mm6,[GOTOFF(ebx,PW_TWO)] + + paddw mm2,mm1 + paddw mm5,mm4 + psrlw mm2,2 ; mm2=OutLE=( 0 2 4 6) + psrlw mm5,2 ; mm5=OutHE=( 8 10 12 14) + paddw mm3,mm1 + paddw mm6,mm4 + psrlw mm3,2 ; mm3=OutLO=( 1 3 5 7) + psrlw mm6,2 ; mm6=OutHO=( 9 11 13 15) + + psllw mm3,BYTE_BIT + psllw mm6,BYTE_BIT + por mm2,mm3 ; mm2=OutL=( 0 1 2 3 4 5 6 7) + por mm5,mm6 ; mm5=OutH=( 8 9 10 11 12 13 14 15) + + movq MMWORD [edi+0*SIZEOF_MMWORD], mm2 + movq MMWORD [edi+1*SIZEOF_MMWORD], mm5 + + sub eax, byte SIZEOF_MMWORD + add esi, byte 1*SIZEOF_MMWORD ; inptr + add edi, byte 2*SIZEOF_MMWORD ; outptr + cmp eax, byte SIZEOF_MMWORD + ja near .columnloop + test eax,eax + jnz near .columnloop_last + + pop esi + pop edi + pop eax + + add esi, byte SIZEOF_JSAMPROW ; input_data + add edi, byte SIZEOF_JSAMPROW ; output_data + dec ecx ; rowctr + jg near .rowloop + + emms ; empty MMX state + +.return: + pop edi + pop esi +; pop edx ; need not be preserved +; pop ecx ; need not be preserved + poppic ebx + pop ebp + ret + +; -------------------------------------------------------------------------- +; +; Fancy processing for the common case of 2:1 horizontal and 2:1 vertical. +; Again a triangle filter; see comments for h2v1 case, above. +; +; GLOBAL(void) +; jsimd_h2v2_fancy_upsample_mmx (int max_v_samp_factor, +; JDIMENSION downsampled_width, +; JSAMPARRAY input_data, +; JSAMPARRAY *output_data_ptr); +; + +%define max_v_samp(b) (b)+8 ; int max_v_samp_factor +%define downsamp_width(b) (b)+12 ; JDIMENSION downsampled_width +%define input_data(b) (b)+16 ; JSAMPARRAY input_data +%define output_data_ptr(b) (b)+20 ; JSAMPARRAY *output_data_ptr + +%define original_ebp ebp+0 +%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_MMWORD ; mmword wk[WK_NUM] +%define WK_NUM 4 +%define gotptr wk(0)-SIZEOF_POINTER ; void *gotptr + + align 16 + global EXTN(jsimd_h2v2_fancy_upsample_mmx) + +EXTN(jsimd_h2v2_fancy_upsample_mmx): + push ebp + mov eax,esp ; eax = original ebp + sub esp, byte 4 + and esp, byte (-SIZEOF_MMWORD) ; align to 64 bits + mov [esp],eax + mov ebp,esp ; ebp = aligned ebp + lea esp, [wk(0)] + pushpic eax ; make a room for GOT address + push ebx +; push ecx ; need not be preserved +; push edx ; need not be preserved + push esi + push edi + + get_GOT ebx ; get GOT address + movpic POINTER [gotptr], ebx ; save GOT address + + mov edx,eax ; edx = original ebp + mov eax, JDIMENSION [downsamp_width(edx)] ; colctr + test eax,eax + jz near .return + + mov ecx, INT [max_v_samp(edx)] ; rowctr + test ecx,ecx + jz near .return + + mov esi, JSAMPARRAY [input_data(edx)] ; input_data + mov edi, POINTER [output_data_ptr(edx)] + mov edi, JSAMPARRAY [edi] ; output_data + alignx 16,7 +.rowloop: + push eax ; colctr + push ecx + push edi + push esi + + mov ecx, JSAMPROW [esi-1*SIZEOF_JSAMPROW] ; inptr1(above) + mov ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW] ; inptr0 + mov esi, JSAMPROW [esi+1*SIZEOF_JSAMPROW] ; inptr1(below) + mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW] ; outptr0 + mov edi, JSAMPROW [edi+1*SIZEOF_JSAMPROW] ; outptr1 + + test eax, SIZEOF_MMWORD-1 + jz short .skip + push edx + mov dl, JSAMPLE [ecx+(eax-1)*SIZEOF_JSAMPLE] + mov JSAMPLE [ecx+eax*SIZEOF_JSAMPLE], dl + mov dl, JSAMPLE [ebx+(eax-1)*SIZEOF_JSAMPLE] + mov JSAMPLE [ebx+eax*SIZEOF_JSAMPLE], dl + mov dl, JSAMPLE [esi+(eax-1)*SIZEOF_JSAMPLE] + mov JSAMPLE [esi+eax*SIZEOF_JSAMPLE], dl ; insert a dummy sample + pop edx +.skip: + ; -- process the first column block + + movq mm0, MMWORD [ebx+0*SIZEOF_MMWORD] ; mm0=row[ 0][0] + movq mm1, MMWORD [ecx+0*SIZEOF_MMWORD] ; mm1=row[-1][0] + movq mm2, MMWORD [esi+0*SIZEOF_MMWORD] ; mm2=row[+1][0] + + pushpic ebx + movpic ebx, POINTER [gotptr] ; load GOT address + + pxor mm3,mm3 ; mm3=(all 0's) + movq mm4,mm0 + punpcklbw mm0,mm3 ; mm0=row[ 0][0]( 0 1 2 3) + punpckhbw mm4,mm3 ; mm4=row[ 0][0]( 4 5 6 7) + movq mm5,mm1 + punpcklbw mm1,mm3 ; mm1=row[-1][0]( 0 1 2 3) + punpckhbw mm5,mm3 ; mm5=row[-1][0]( 4 5 6 7) + movq mm6,mm2 + punpcklbw mm2,mm3 ; mm2=row[+1][0]( 0 1 2 3) + punpckhbw mm6,mm3 ; mm6=row[+1][0]( 4 5 6 7) + + pmullw mm0,[GOTOFF(ebx,PW_THREE)] + pmullw mm4,[GOTOFF(ebx,PW_THREE)] + + pcmpeqb mm7,mm7 + psrlq mm7,(SIZEOF_MMWORD-2)*BYTE_BIT + + paddw mm1,mm0 ; mm1=Int0L=( 0 1 2 3) + paddw mm5,mm4 ; mm5=Int0H=( 4 5 6 7) + paddw mm2,mm0 ; mm2=Int1L=( 0 1 2 3) + paddw mm6,mm4 ; mm6=Int1H=( 4 5 6 7) + + movq MMWORD [edx+0*SIZEOF_MMWORD], mm1 ; temporarily save + movq MMWORD [edx+1*SIZEOF_MMWORD], mm5 ; the intermediate data + movq MMWORD [edi+0*SIZEOF_MMWORD], mm2 + movq MMWORD [edi+1*SIZEOF_MMWORD], mm6 + + pand mm1,mm7 ; mm1=( 0 - - -) + pand mm2,mm7 ; mm2=( 0 - - -) + + movq MMWORD [wk(0)], mm1 + movq MMWORD [wk(1)], mm2 + + poppic ebx + + add eax, byte SIZEOF_MMWORD-1 + and eax, byte -SIZEOF_MMWORD + cmp eax, byte SIZEOF_MMWORD + ja short .columnloop + alignx 16,7 + +.columnloop_last: + ; -- process the last column block + + pushpic ebx + movpic ebx, POINTER [gotptr] ; load GOT address + + pcmpeqb mm1,mm1 + psllq mm1,(SIZEOF_MMWORD-2)*BYTE_BIT + movq mm2,mm1 + + pand mm1, MMWORD [edx+1*SIZEOF_MMWORD] ; mm1=( - - - 7) + pand mm2, MMWORD [edi+1*SIZEOF_MMWORD] ; mm2=( - - - 7) + + movq MMWORD [wk(2)], mm1 + movq MMWORD [wk(3)], mm2 + + jmp short .upsample + alignx 16,7 + +.columnloop: + ; -- process the next column block + + movq mm0, MMWORD [ebx+1*SIZEOF_MMWORD] ; mm0=row[ 0][1] + movq mm1, MMWORD [ecx+1*SIZEOF_MMWORD] ; mm1=row[-1][1] + movq mm2, MMWORD [esi+1*SIZEOF_MMWORD] ; mm2=row[+1][1] + + pushpic ebx + movpic ebx, POINTER [gotptr] ; load GOT address + + pxor mm3,mm3 ; mm3=(all 0's) + movq mm4,mm0 + punpcklbw mm0,mm3 ; mm0=row[ 0][1]( 0 1 2 3) + punpckhbw mm4,mm3 ; mm4=row[ 0][1]( 4 5 6 7) + movq mm5,mm1 + punpcklbw mm1,mm3 ; mm1=row[-1][1]( 0 1 2 3) + punpckhbw mm5,mm3 ; mm5=row[-1][1]( 4 5 6 7) + movq mm6,mm2 + punpcklbw mm2,mm3 ; mm2=row[+1][1]( 0 1 2 3) + punpckhbw mm6,mm3 ; mm6=row[+1][1]( 4 5 6 7) + + pmullw mm0,[GOTOFF(ebx,PW_THREE)] + pmullw mm4,[GOTOFF(ebx,PW_THREE)] + + paddw mm1,mm0 ; mm1=Int0L=( 0 1 2 3) + paddw mm5,mm4 ; mm5=Int0H=( 4 5 6 7) + paddw mm2,mm0 ; mm2=Int1L=( 0 1 2 3) + paddw mm6,mm4 ; mm6=Int1H=( 4 5 6 7) + + movq MMWORD [edx+2*SIZEOF_MMWORD], mm1 ; temporarily save + movq MMWORD [edx+3*SIZEOF_MMWORD], mm5 ; the intermediate data + movq MMWORD [edi+2*SIZEOF_MMWORD], mm2 + movq MMWORD [edi+3*SIZEOF_MMWORD], mm6 + + psllq mm1,(SIZEOF_MMWORD-2)*BYTE_BIT ; mm1=( - - - 0) + psllq mm2,(SIZEOF_MMWORD-2)*BYTE_BIT ; mm2=( - - - 0) + + movq MMWORD [wk(2)], mm1 + movq MMWORD [wk(3)], mm2 + +.upsample: + ; -- process the upper row + + movq mm7, MMWORD [edx+0*SIZEOF_MMWORD] ; mm7=Int0L=( 0 1 2 3) + movq mm3, MMWORD [edx+1*SIZEOF_MMWORD] ; mm3=Int0H=( 4 5 6 7) + + movq mm0,mm7 + movq mm4,mm3 + psrlq mm0,2*BYTE_BIT ; mm0=( 1 2 3 -) + psllq mm4,(SIZEOF_MMWORD-2)*BYTE_BIT ; mm4=( - - - 4) + movq mm5,mm7 + movq mm6,mm3 + psrlq mm5,(SIZEOF_MMWORD-2)*BYTE_BIT ; mm5=( 3 - - -) + psllq mm6,2*BYTE_BIT ; mm6=( - 4 5 6) + + por mm0,mm4 ; mm0=( 1 2 3 4) + por mm5,mm6 ; mm5=( 3 4 5 6) + + movq mm1,mm7 + movq mm2,mm3 + psllq mm1,2*BYTE_BIT ; mm1=( - 0 1 2) + psrlq mm2,2*BYTE_BIT ; mm2=( 5 6 7 -) + movq mm4,mm3 + psrlq mm4,(SIZEOF_MMWORD-2)*BYTE_BIT ; mm4=( 7 - - -) + + por mm1, MMWORD [wk(0)] ; mm1=(-1 0 1 2) + por mm2, MMWORD [wk(2)] ; mm2=( 5 6 7 8) + + movq MMWORD [wk(0)], mm4 + + pmullw mm7,[GOTOFF(ebx,PW_THREE)] + pmullw mm3,[GOTOFF(ebx,PW_THREE)] + paddw mm1,[GOTOFF(ebx,PW_EIGHT)] + paddw mm5,[GOTOFF(ebx,PW_EIGHT)] + paddw mm0,[GOTOFF(ebx,PW_SEVEN)] + paddw mm2,[GOTOFF(ebx,PW_SEVEN)] + + paddw mm1,mm7 + paddw mm5,mm3 + psrlw mm1,4 ; mm1=Out0LE=( 0 2 4 6) + psrlw mm5,4 ; mm5=Out0HE=( 8 10 12 14) + paddw mm0,mm7 + paddw mm2,mm3 + psrlw mm0,4 ; mm0=Out0LO=( 1 3 5 7) + psrlw mm2,4 ; mm2=Out0HO=( 9 11 13 15) + + psllw mm0,BYTE_BIT + psllw mm2,BYTE_BIT + por mm1,mm0 ; mm1=Out0L=( 0 1 2 3 4 5 6 7) + por mm5,mm2 ; mm5=Out0H=( 8 9 10 11 12 13 14 15) + + movq MMWORD [edx+0*SIZEOF_MMWORD], mm1 + movq MMWORD [edx+1*SIZEOF_MMWORD], mm5 + + ; -- process the lower row + + movq mm6, MMWORD [edi+0*SIZEOF_MMWORD] ; mm6=Int1L=( 0 1 2 3) + movq mm4, MMWORD [edi+1*SIZEOF_MMWORD] ; mm4=Int1H=( 4 5 6 7) + + movq mm7,mm6 + movq mm3,mm4 + psrlq mm7,2*BYTE_BIT ; mm7=( 1 2 3 -) + psllq mm3,(SIZEOF_MMWORD-2)*BYTE_BIT ; mm3=( - - - 4) + movq mm0,mm6 + movq mm2,mm4 + psrlq mm0,(SIZEOF_MMWORD-2)*BYTE_BIT ; mm0=( 3 - - -) + psllq mm2,2*BYTE_BIT ; mm2=( - 4 5 6) + + por mm7,mm3 ; mm7=( 1 2 3 4) + por mm0,mm2 ; mm0=( 3 4 5 6) + + movq mm1,mm6 + movq mm5,mm4 + psllq mm1,2*BYTE_BIT ; mm1=( - 0 1 2) + psrlq mm5,2*BYTE_BIT ; mm5=( 5 6 7 -) + movq mm3,mm4 + psrlq mm3,(SIZEOF_MMWORD-2)*BYTE_BIT ; mm3=( 7 - - -) + + por mm1, MMWORD [wk(1)] ; mm1=(-1 0 1 2) + por mm5, MMWORD [wk(3)] ; mm5=( 5 6 7 8) + + movq MMWORD [wk(1)], mm3 + + pmullw mm6,[GOTOFF(ebx,PW_THREE)] + pmullw mm4,[GOTOFF(ebx,PW_THREE)] + paddw mm1,[GOTOFF(ebx,PW_EIGHT)] + paddw mm0,[GOTOFF(ebx,PW_EIGHT)] + paddw mm7,[GOTOFF(ebx,PW_SEVEN)] + paddw mm5,[GOTOFF(ebx,PW_SEVEN)] + + paddw mm1,mm6 + paddw mm0,mm4 + psrlw mm1,4 ; mm1=Out1LE=( 0 2 4 6) + psrlw mm0,4 ; mm0=Out1HE=( 8 10 12 14) + paddw mm7,mm6 + paddw mm5,mm4 + psrlw mm7,4 ; mm7=Out1LO=( 1 3 5 7) + psrlw mm5,4 ; mm5=Out1HO=( 9 11 13 15) + + psllw mm7,BYTE_BIT + psllw mm5,BYTE_BIT + por mm1,mm7 ; mm1=Out1L=( 0 1 2 3 4 5 6 7) + por mm0,mm5 ; mm0=Out1H=( 8 9 10 11 12 13 14 15) + + movq MMWORD [edi+0*SIZEOF_MMWORD], mm1 + movq MMWORD [edi+1*SIZEOF_MMWORD], mm0 + + poppic ebx + + sub eax, byte SIZEOF_MMWORD + add ecx, byte 1*SIZEOF_MMWORD ; inptr1(above) + add ebx, byte 1*SIZEOF_MMWORD ; inptr0 + add esi, byte 1*SIZEOF_MMWORD ; inptr1(below) + add edx, byte 2*SIZEOF_MMWORD ; outptr0 + add edi, byte 2*SIZEOF_MMWORD ; outptr1 + cmp eax, byte SIZEOF_MMWORD + ja near .columnloop + test eax,eax + jnz near .columnloop_last + + pop esi + pop edi + pop ecx + pop eax + + add esi, byte 1*SIZEOF_JSAMPROW ; input_data + add edi, byte 2*SIZEOF_JSAMPROW ; output_data + sub ecx, byte 2 ; rowctr + jg near .rowloop + + emms ; empty MMX state + +.return: + pop edi + pop esi +; pop edx ; need not be preserved +; pop ecx ; need not be preserved + pop ebx + mov esp,ebp ; esp <- aligned ebp + pop esp ; esp <- original ebp + pop ebp + ret + +; -------------------------------------------------------------------------- +; +; Fast processing for the common case of 2:1 horizontal and 1:1 vertical. +; It's still a box filter. +; +; GLOBAL(void) +; jsimd_h2v1_upsample_mmx (int max_v_samp_factor, +; JDIMENSION output_width, +; JSAMPARRAY input_data, +; JSAMPARRAY *output_data_ptr); +; + +%define max_v_samp(b) (b)+8 ; int max_v_samp_factor +%define output_width(b) (b)+12 ; JDIMENSION output_width +%define input_data(b) (b)+16 ; JSAMPARRAY input_data +%define output_data_ptr(b) (b)+20 ; JSAMPARRAY *output_data_ptr + + align 16 + global EXTN(jsimd_h2v1_upsample_mmx) + +EXTN(jsimd_h2v1_upsample_mmx): + push ebp + mov ebp,esp +; push ebx ; unused +; push ecx ; need not be preserved +; push edx ; need not be preserved + push esi + push edi + + mov edx, JDIMENSION [output_width(ebp)] + add edx, byte (2*SIZEOF_MMWORD)-1 + and edx, byte -(2*SIZEOF_MMWORD) + jz short .return + + mov ecx, INT [max_v_samp(ebp)] ; rowctr + test ecx,ecx + jz short .return + + mov esi, JSAMPARRAY [input_data(ebp)] ; input_data + mov edi, POINTER [output_data_ptr(ebp)] + mov edi, JSAMPARRAY [edi] ; output_data + alignx 16,7 +.rowloop: + push edi + push esi + + mov esi, JSAMPROW [esi] ; inptr + mov edi, JSAMPROW [edi] ; outptr + mov eax,edx ; colctr + alignx 16,7 +.columnloop: + + movq mm0, MMWORD [esi+0*SIZEOF_MMWORD] + + movq mm1,mm0 + punpcklbw mm0,mm0 + punpckhbw mm1,mm1 + + movq MMWORD [edi+0*SIZEOF_MMWORD], mm0 + movq MMWORD [edi+1*SIZEOF_MMWORD], mm1 + + sub eax, byte 2*SIZEOF_MMWORD + jz short .nextrow + + movq mm2, MMWORD [esi+1*SIZEOF_MMWORD] + + movq mm3,mm2 + punpcklbw mm2,mm2 + punpckhbw mm3,mm3 + + movq MMWORD [edi+2*SIZEOF_MMWORD], mm2 + movq MMWORD [edi+3*SIZEOF_MMWORD], mm3 + + sub eax, byte 2*SIZEOF_MMWORD + jz short .nextrow + + add esi, byte 2*SIZEOF_MMWORD ; inptr + add edi, byte 4*SIZEOF_MMWORD ; outptr + jmp short .columnloop + alignx 16,7 + +.nextrow: + pop esi + pop edi + + add esi, byte SIZEOF_JSAMPROW ; input_data + add edi, byte SIZEOF_JSAMPROW ; output_data + dec ecx ; rowctr + jg short .rowloop + + emms ; empty MMX state + +.return: + pop edi + pop esi +; pop edx ; need not be preserved +; pop ecx ; need not be preserved +; pop ebx ; unused + pop ebp + ret + +; -------------------------------------------------------------------------- +; +; Fast processing for the common case of 2:1 horizontal and 2:1 vertical. +; It's still a box filter. +; +; GLOBAL(void) +; jsimd_h2v2_upsample_mmx (int max_v_samp_factor, +; JDIMENSION output_width, +; JSAMPARRAY input_data, +; JSAMPARRAY *output_data_ptr); +; + +%define max_v_samp(b) (b)+8 ; int max_v_samp_factor +%define output_width(b) (b)+12 ; JDIMENSION output_width +%define input_data(b) (b)+16 ; JSAMPARRAY input_data +%define output_data_ptr(b) (b)+20 ; JSAMPARRAY *output_data_ptr + + align 16 + global EXTN(jsimd_h2v2_upsample_mmx) + +EXTN(jsimd_h2v2_upsample_mmx): + push ebp + mov ebp,esp + push ebx +; push ecx ; need not be preserved +; push edx ; need not be preserved + push esi + push edi + + mov edx, JDIMENSION [output_width(ebp)] + add edx, byte (2*SIZEOF_MMWORD)-1 + and edx, byte -(2*SIZEOF_MMWORD) + jz near .return + + mov ecx, INT [max_v_samp(ebp)] ; rowctr + test ecx,ecx + jz short .return + + mov esi, JSAMPARRAY [input_data(ebp)] ; input_data + mov edi, POINTER [output_data_ptr(ebp)] + mov edi, JSAMPARRAY [edi] ; output_data + alignx 16,7 +.rowloop: + push edi + push esi + + mov esi, JSAMPROW [esi] ; inptr + mov ebx, JSAMPROW [edi+0*SIZEOF_JSAMPROW] ; outptr0 + mov edi, JSAMPROW [edi+1*SIZEOF_JSAMPROW] ; outptr1 + mov eax,edx ; colctr + alignx 16,7 +.columnloop: + + movq mm0, MMWORD [esi+0*SIZEOF_MMWORD] + + movq mm1,mm0 + punpcklbw mm0,mm0 + punpckhbw mm1,mm1 + + movq MMWORD [ebx+0*SIZEOF_MMWORD], mm0 + movq MMWORD [ebx+1*SIZEOF_MMWORD], mm1 + movq MMWORD [edi+0*SIZEOF_MMWORD], mm0 + movq MMWORD [edi+1*SIZEOF_MMWORD], mm1 + + sub eax, byte 2*SIZEOF_MMWORD + jz short .nextrow + + movq mm2, MMWORD [esi+1*SIZEOF_MMWORD] + + movq mm3,mm2 + punpcklbw mm2,mm2 + punpckhbw mm3,mm3 + + movq MMWORD [ebx+2*SIZEOF_MMWORD], mm2 + movq MMWORD [ebx+3*SIZEOF_MMWORD], mm3 + movq MMWORD [edi+2*SIZEOF_MMWORD], mm2 + movq MMWORD [edi+3*SIZEOF_MMWORD], mm3 + + sub eax, byte 2*SIZEOF_MMWORD + jz short .nextrow + + add esi, byte 2*SIZEOF_MMWORD ; inptr + add ebx, byte 4*SIZEOF_MMWORD ; outptr0 + add edi, byte 4*SIZEOF_MMWORD ; outptr1 + jmp short .columnloop + alignx 16,7 + +.nextrow: + pop esi + pop edi + + add esi, byte 1*SIZEOF_JSAMPROW ; input_data + add edi, byte 2*SIZEOF_JSAMPROW ; output_data + sub ecx, byte 2 ; rowctr + jg short .rowloop + + emms ; empty MMX state + +.return: + pop edi + pop esi +; pop edx ; need not be preserved +; pop ecx ; need not be preserved + pop ebx + pop ebp + ret + +; For some reason, the OS X linker does not honor the request to align the +; segment unless we do this. + align 16 diff --git a/media/libjpeg/simd/jdsample-sse2-64.asm b/media/libjpeg/simd/jdsample-sse2-64.asm new file mode 100644 index 000000000..1faaed648 --- /dev/null +++ b/media/libjpeg/simd/jdsample-sse2-64.asm @@ -0,0 +1,670 @@ +; +; jdsample.asm - upsampling (64-bit SSE2) +; +; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB +; Copyright (C) 2009, D. R. Commander. +; +; Based on the x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler). +; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 +; +; [TAB8] + +%include "jsimdext.inc" + +; -------------------------------------------------------------------------- + SECTION SEG_CONST + + alignz 16 + global EXTN(jconst_fancy_upsample_sse2) + +EXTN(jconst_fancy_upsample_sse2): + +PW_ONE times 8 dw 1 +PW_TWO times 8 dw 2 +PW_THREE times 8 dw 3 +PW_SEVEN times 8 dw 7 +PW_EIGHT times 8 dw 8 + + alignz 16 + +; -------------------------------------------------------------------------- + SECTION SEG_TEXT + BITS 64 +; +; Fancy processing for the common case of 2:1 horizontal and 1:1 vertical. +; +; The upsampling algorithm is linear interpolation between pixel centers, +; also known as a "triangle filter". This is a good compromise between +; speed and visual quality. The centers of the output pixels are 1/4 and 3/4 +; of the way between input pixel centers. +; +; GLOBAL(void) +; jsimd_h2v1_fancy_upsample_sse2 (int max_v_samp_factor, +; JDIMENSION downsampled_width, +; JSAMPARRAY input_data, +; JSAMPARRAY *output_data_ptr); +; + +; r10 = int max_v_samp_factor +; r11 = JDIMENSION downsampled_width +; r12 = JSAMPARRAY input_data +; r13 = JSAMPARRAY *output_data_ptr + + align 16 + global EXTN(jsimd_h2v1_fancy_upsample_sse2) + +EXTN(jsimd_h2v1_fancy_upsample_sse2): + push rbp + mov rax,rsp + mov rbp,rsp + collect_args + + mov eax, r11d ; colctr + test rax,rax + jz near .return + + mov rcx, r10 ; rowctr + test rcx,rcx + jz near .return + + mov rsi, r12 ; input_data + mov rdi, r13 + mov rdi, JSAMPARRAY [rdi] ; output_data +.rowloop: + push rax ; colctr + push rdi + push rsi + + mov rsi, JSAMPROW [rsi] ; inptr + mov rdi, JSAMPROW [rdi] ; outptr + + test rax, SIZEOF_XMMWORD-1 + jz short .skip + mov dl, JSAMPLE [rsi+(rax-1)*SIZEOF_JSAMPLE] + mov JSAMPLE [rsi+rax*SIZEOF_JSAMPLE], dl ; insert a dummy sample +.skip: + pxor xmm0,xmm0 ; xmm0=(all 0's) + pcmpeqb xmm7,xmm7 + psrldq xmm7,(SIZEOF_XMMWORD-1) + pand xmm7, XMMWORD [rsi+0*SIZEOF_XMMWORD] + + add rax, byte SIZEOF_XMMWORD-1 + and rax, byte -SIZEOF_XMMWORD + cmp rax, byte SIZEOF_XMMWORD + ja short .columnloop + +.columnloop_last: + pcmpeqb xmm6,xmm6 + pslldq xmm6,(SIZEOF_XMMWORD-1) + pand xmm6, XMMWORD [rsi+0*SIZEOF_XMMWORD] + jmp short .upsample + +.columnloop: + movdqa xmm6, XMMWORD [rsi+1*SIZEOF_XMMWORD] + pslldq xmm6,(SIZEOF_XMMWORD-1) + +.upsample: + movdqa xmm1, XMMWORD [rsi+0*SIZEOF_XMMWORD] + movdqa xmm2,xmm1 + movdqa xmm3,xmm1 ; xmm1=( 0 1 2 ... 13 14 15) + pslldq xmm2,1 ; xmm2=(-- 0 1 ... 12 13 14) + psrldq xmm3,1 ; xmm3=( 1 2 3 ... 14 15 --) + + por xmm2,xmm7 ; xmm2=(-1 0 1 ... 12 13 14) + por xmm3,xmm6 ; xmm3=( 1 2 3 ... 14 15 16) + + movdqa xmm7,xmm1 + psrldq xmm7,(SIZEOF_XMMWORD-1) ; xmm7=(15 -- -- ... -- -- --) + + movdqa xmm4,xmm1 + punpcklbw xmm1,xmm0 ; xmm1=( 0 1 2 3 4 5 6 7) + punpckhbw xmm4,xmm0 ; xmm4=( 8 9 10 11 12 13 14 15) + movdqa xmm5,xmm2 + punpcklbw xmm2,xmm0 ; xmm2=(-1 0 1 2 3 4 5 6) + punpckhbw xmm5,xmm0 ; xmm5=( 7 8 9 10 11 12 13 14) + movdqa xmm6,xmm3 + punpcklbw xmm3,xmm0 ; xmm3=( 1 2 3 4 5 6 7 8) + punpckhbw xmm6,xmm0 ; xmm6=( 9 10 11 12 13 14 15 16) + + pmullw xmm1,[rel PW_THREE] + pmullw xmm4,[rel PW_THREE] + paddw xmm2,[rel PW_ONE] + paddw xmm5,[rel PW_ONE] + paddw xmm3,[rel PW_TWO] + paddw xmm6,[rel PW_TWO] + + paddw xmm2,xmm1 + paddw xmm5,xmm4 + psrlw xmm2,2 ; xmm2=OutLE=( 0 2 4 6 8 10 12 14) + psrlw xmm5,2 ; xmm5=OutHE=(16 18 20 22 24 26 28 30) + paddw xmm3,xmm1 + paddw xmm6,xmm4 + psrlw xmm3,2 ; xmm3=OutLO=( 1 3 5 7 9 11 13 15) + psrlw xmm6,2 ; xmm6=OutHO=(17 19 21 23 25 27 29 31) + + psllw xmm3,BYTE_BIT + psllw xmm6,BYTE_BIT + por xmm2,xmm3 ; xmm2=OutL=( 0 1 2 ... 13 14 15) + por xmm5,xmm6 ; xmm5=OutH=(16 17 18 ... 29 30 31) + + movdqa XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm2 + movdqa XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm5 + + sub rax, byte SIZEOF_XMMWORD + add rsi, byte 1*SIZEOF_XMMWORD ; inptr + add rdi, byte 2*SIZEOF_XMMWORD ; outptr + cmp rax, byte SIZEOF_XMMWORD + ja near .columnloop + test eax,eax + jnz near .columnloop_last + + pop rsi + pop rdi + pop rax + + add rsi, byte SIZEOF_JSAMPROW ; input_data + add rdi, byte SIZEOF_JSAMPROW ; output_data + dec rcx ; rowctr + jg near .rowloop + +.return: + uncollect_args + pop rbp + ret + +; -------------------------------------------------------------------------- +; +; Fancy processing for the common case of 2:1 horizontal and 2:1 vertical. +; Again a triangle filter; see comments for h2v1 case, above. +; +; GLOBAL(void) +; jsimd_h2v2_fancy_upsample_sse2 (int max_v_samp_factor, +; JDIMENSION downsampled_width, +; JSAMPARRAY input_data, +; JSAMPARRAY *output_data_ptr); +; + +; r10 = int max_v_samp_factor +; r11 = JDIMENSION downsampled_width +; r12 = JSAMPARRAY input_data +; r13 = JSAMPARRAY *output_data_ptr + +%define wk(i) rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM] +%define WK_NUM 4 + + align 16 + global EXTN(jsimd_h2v2_fancy_upsample_sse2) + +EXTN(jsimd_h2v2_fancy_upsample_sse2): + push rbp + mov rax,rsp ; rax = original rbp + sub rsp, byte 4 + and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits + mov [rsp],rax + mov rbp,rsp ; rbp = aligned rbp + lea rsp, [wk(0)] + collect_args + push rbx + + mov eax, r11d ; colctr + test rax,rax + jz near .return + + mov rcx, r10 ; rowctr + test rcx,rcx + jz near .return + + mov rsi, r12 ; input_data + mov rdi, r13 + mov rdi, JSAMPARRAY [rdi] ; output_data +.rowloop: + push rax ; colctr + push rcx + push rdi + push rsi + + mov rcx, JSAMPROW [rsi-1*SIZEOF_JSAMPROW] ; inptr1(above) + mov rbx, JSAMPROW [rsi+0*SIZEOF_JSAMPROW] ; inptr0 + mov rsi, JSAMPROW [rsi+1*SIZEOF_JSAMPROW] ; inptr1(below) + mov rdx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW] ; outptr0 + mov rdi, JSAMPROW [rdi+1*SIZEOF_JSAMPROW] ; outptr1 + + test rax, SIZEOF_XMMWORD-1 + jz short .skip + push rdx + mov dl, JSAMPLE [rcx+(rax-1)*SIZEOF_JSAMPLE] + mov JSAMPLE [rcx+rax*SIZEOF_JSAMPLE], dl + mov dl, JSAMPLE [rbx+(rax-1)*SIZEOF_JSAMPLE] + mov JSAMPLE [rbx+rax*SIZEOF_JSAMPLE], dl + mov dl, JSAMPLE [rsi+(rax-1)*SIZEOF_JSAMPLE] + mov JSAMPLE [rsi+rax*SIZEOF_JSAMPLE], dl ; insert a dummy sample + pop rdx +.skip: + ; -- process the first column block + + movdqa xmm0, XMMWORD [rbx+0*SIZEOF_XMMWORD] ; xmm0=row[ 0][0] + movdqa xmm1, XMMWORD [rcx+0*SIZEOF_XMMWORD] ; xmm1=row[-1][0] + movdqa xmm2, XMMWORD [rsi+0*SIZEOF_XMMWORD] ; xmm2=row[+1][0] + + pxor xmm3,xmm3 ; xmm3=(all 0's) + movdqa xmm4,xmm0 + punpcklbw xmm0,xmm3 ; xmm0=row[ 0]( 0 1 2 3 4 5 6 7) + punpckhbw xmm4,xmm3 ; xmm4=row[ 0]( 8 9 10 11 12 13 14 15) + movdqa xmm5,xmm1 + punpcklbw xmm1,xmm3 ; xmm1=row[-1]( 0 1 2 3 4 5 6 7) + punpckhbw xmm5,xmm3 ; xmm5=row[-1]( 8 9 10 11 12 13 14 15) + movdqa xmm6,xmm2 + punpcklbw xmm2,xmm3 ; xmm2=row[+1]( 0 1 2 3 4 5 6 7) + punpckhbw xmm6,xmm3 ; xmm6=row[+1]( 8 9 10 11 12 13 14 15) + + pmullw xmm0,[rel PW_THREE] + pmullw xmm4,[rel PW_THREE] + + pcmpeqb xmm7,xmm7 + psrldq xmm7,(SIZEOF_XMMWORD-2) + + paddw xmm1,xmm0 ; xmm1=Int0L=( 0 1 2 3 4 5 6 7) + paddw xmm5,xmm4 ; xmm5=Int0H=( 8 9 10 11 12 13 14 15) + paddw xmm2,xmm0 ; xmm2=Int1L=( 0 1 2 3 4 5 6 7) + paddw xmm6,xmm4 ; xmm6=Int1H=( 8 9 10 11 12 13 14 15) + + movdqa XMMWORD [rdx+0*SIZEOF_XMMWORD], xmm1 ; temporarily save + movdqa XMMWORD [rdx+1*SIZEOF_XMMWORD], xmm5 ; the intermediate data + movdqa XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm2 + movdqa XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm6 + + pand xmm1,xmm7 ; xmm1=( 0 -- -- -- -- -- -- --) + pand xmm2,xmm7 ; xmm2=( 0 -- -- -- -- -- -- --) + + movdqa XMMWORD [wk(0)], xmm1 + movdqa XMMWORD [wk(1)], xmm2 + + add rax, byte SIZEOF_XMMWORD-1 + and rax, byte -SIZEOF_XMMWORD + cmp rax, byte SIZEOF_XMMWORD + ja short .columnloop + +.columnloop_last: + ; -- process the last column block + + pcmpeqb xmm1,xmm1 + pslldq xmm1,(SIZEOF_XMMWORD-2) + movdqa xmm2,xmm1 + + pand xmm1, XMMWORD [rdx+1*SIZEOF_XMMWORD] + pand xmm2, XMMWORD [rdi+1*SIZEOF_XMMWORD] + + movdqa XMMWORD [wk(2)], xmm1 ; xmm1=(-- -- -- -- -- -- -- 15) + movdqa XMMWORD [wk(3)], xmm2 ; xmm2=(-- -- -- -- -- -- -- 15) + + jmp near .upsample + +.columnloop: + ; -- process the next column block + + movdqa xmm0, XMMWORD [rbx+1*SIZEOF_XMMWORD] ; xmm0=row[ 0][1] + movdqa xmm1, XMMWORD [rcx+1*SIZEOF_XMMWORD] ; xmm1=row[-1][1] + movdqa xmm2, XMMWORD [rsi+1*SIZEOF_XMMWORD] ; xmm2=row[+1][1] + + pxor xmm3,xmm3 ; xmm3=(all 0's) + movdqa xmm4,xmm0 + punpcklbw xmm0,xmm3 ; xmm0=row[ 0]( 0 1 2 3 4 5 6 7) + punpckhbw xmm4,xmm3 ; xmm4=row[ 0]( 8 9 10 11 12 13 14 15) + movdqa xmm5,xmm1 + punpcklbw xmm1,xmm3 ; xmm1=row[-1]( 0 1 2 3 4 5 6 7) + punpckhbw xmm5,xmm3 ; xmm5=row[-1]( 8 9 10 11 12 13 14 15) + movdqa xmm6,xmm2 + punpcklbw xmm2,xmm3 ; xmm2=row[+1]( 0 1 2 3 4 5 6 7) + punpckhbw xmm6,xmm3 ; xmm6=row[+1]( 8 9 10 11 12 13 14 15) + + pmullw xmm0,[rel PW_THREE] + pmullw xmm4,[rel PW_THREE] + + paddw xmm1,xmm0 ; xmm1=Int0L=( 0 1 2 3 4 5 6 7) + paddw xmm5,xmm4 ; xmm5=Int0H=( 8 9 10 11 12 13 14 15) + paddw xmm2,xmm0 ; xmm2=Int1L=( 0 1 2 3 4 5 6 7) + paddw xmm6,xmm4 ; xmm6=Int1H=( 8 9 10 11 12 13 14 15) + + movdqa XMMWORD [rdx+2*SIZEOF_XMMWORD], xmm1 ; temporarily save + movdqa XMMWORD [rdx+3*SIZEOF_XMMWORD], xmm5 ; the intermediate data + movdqa XMMWORD [rdi+2*SIZEOF_XMMWORD], xmm2 + movdqa XMMWORD [rdi+3*SIZEOF_XMMWORD], xmm6 + + pslldq xmm1,(SIZEOF_XMMWORD-2) ; xmm1=(-- -- -- -- -- -- -- 0) + pslldq xmm2,(SIZEOF_XMMWORD-2) ; xmm2=(-- -- -- -- -- -- -- 0) + + movdqa XMMWORD [wk(2)], xmm1 + movdqa XMMWORD [wk(3)], xmm2 + +.upsample: + ; -- process the upper row + + movdqa xmm7, XMMWORD [rdx+0*SIZEOF_XMMWORD] + movdqa xmm3, XMMWORD [rdx+1*SIZEOF_XMMWORD] + + movdqa xmm0,xmm7 ; xmm7=Int0L=( 0 1 2 3 4 5 6 7) + movdqa xmm4,xmm3 ; xmm3=Int0H=( 8 9 10 11 12 13 14 15) + psrldq xmm0,2 ; xmm0=( 1 2 3 4 5 6 7 --) + pslldq xmm4,(SIZEOF_XMMWORD-2) ; xmm4=(-- -- -- -- -- -- -- 8) + movdqa xmm5,xmm7 + movdqa xmm6,xmm3 + psrldq xmm5,(SIZEOF_XMMWORD-2) ; xmm5=( 7 -- -- -- -- -- -- --) + pslldq xmm6,2 ; xmm6=(-- 8 9 10 11 12 13 14) + + por xmm0,xmm4 ; xmm0=( 1 2 3 4 5 6 7 8) + por xmm5,xmm6 ; xmm5=( 7 8 9 10 11 12 13 14) + + movdqa xmm1,xmm7 + movdqa xmm2,xmm3 + pslldq xmm1,2 ; xmm1=(-- 0 1 2 3 4 5 6) + psrldq xmm2,2 ; xmm2=( 9 10 11 12 13 14 15 --) + movdqa xmm4,xmm3 + psrldq xmm4,(SIZEOF_XMMWORD-2) ; xmm4=(15 -- -- -- -- -- -- --) + + por xmm1, XMMWORD [wk(0)] ; xmm1=(-1 0 1 2 3 4 5 6) + por xmm2, XMMWORD [wk(2)] ; xmm2=( 9 10 11 12 13 14 15 16) + + movdqa XMMWORD [wk(0)], xmm4 + + pmullw xmm7,[rel PW_THREE] + pmullw xmm3,[rel PW_THREE] + paddw xmm1,[rel PW_EIGHT] + paddw xmm5,[rel PW_EIGHT] + paddw xmm0,[rel PW_SEVEN] + paddw xmm2,[rel PW_SEVEN] + + paddw xmm1,xmm7 + paddw xmm5,xmm3 + psrlw xmm1,4 ; xmm1=Out0LE=( 0 2 4 6 8 10 12 14) + psrlw xmm5,4 ; xmm5=Out0HE=(16 18 20 22 24 26 28 30) + paddw xmm0,xmm7 + paddw xmm2,xmm3 + psrlw xmm0,4 ; xmm0=Out0LO=( 1 3 5 7 9 11 13 15) + psrlw xmm2,4 ; xmm2=Out0HO=(17 19 21 23 25 27 29 31) + + psllw xmm0,BYTE_BIT + psllw xmm2,BYTE_BIT + por xmm1,xmm0 ; xmm1=Out0L=( 0 1 2 ... 13 14 15) + por xmm5,xmm2 ; xmm5=Out0H=(16 17 18 ... 29 30 31) + + movdqa XMMWORD [rdx+0*SIZEOF_XMMWORD], xmm1 + movdqa XMMWORD [rdx+1*SIZEOF_XMMWORD], xmm5 + + ; -- process the lower row + + movdqa xmm6, XMMWORD [rdi+0*SIZEOF_XMMWORD] + movdqa xmm4, XMMWORD [rdi+1*SIZEOF_XMMWORD] + + movdqa xmm7,xmm6 ; xmm6=Int1L=( 0 1 2 3 4 5 6 7) + movdqa xmm3,xmm4 ; xmm4=Int1H=( 8 9 10 11 12 13 14 15) + psrldq xmm7,2 ; xmm7=( 1 2 3 4 5 6 7 --) + pslldq xmm3,(SIZEOF_XMMWORD-2) ; xmm3=(-- -- -- -- -- -- -- 8) + movdqa xmm0,xmm6 + movdqa xmm2,xmm4 + psrldq xmm0,(SIZEOF_XMMWORD-2) ; xmm0=( 7 -- -- -- -- -- -- --) + pslldq xmm2,2 ; xmm2=(-- 8 9 10 11 12 13 14) + + por xmm7,xmm3 ; xmm7=( 1 2 3 4 5 6 7 8) + por xmm0,xmm2 ; xmm0=( 7 8 9 10 11 12 13 14) + + movdqa xmm1,xmm6 + movdqa xmm5,xmm4 + pslldq xmm1,2 ; xmm1=(-- 0 1 2 3 4 5 6) + psrldq xmm5,2 ; xmm5=( 9 10 11 12 13 14 15 --) + movdqa xmm3,xmm4 + psrldq xmm3,(SIZEOF_XMMWORD-2) ; xmm3=(15 -- -- -- -- -- -- --) + + por xmm1, XMMWORD [wk(1)] ; xmm1=(-1 0 1 2 3 4 5 6) + por xmm5, XMMWORD [wk(3)] ; xmm5=( 9 10 11 12 13 14 15 16) + + movdqa XMMWORD [wk(1)], xmm3 + + pmullw xmm6,[rel PW_THREE] + pmullw xmm4,[rel PW_THREE] + paddw xmm1,[rel PW_EIGHT] + paddw xmm0,[rel PW_EIGHT] + paddw xmm7,[rel PW_SEVEN] + paddw xmm5,[rel PW_SEVEN] + + paddw xmm1,xmm6 + paddw xmm0,xmm4 + psrlw xmm1,4 ; xmm1=Out1LE=( 0 2 4 6 8 10 12 14) + psrlw xmm0,4 ; xmm0=Out1HE=(16 18 20 22 24 26 28 30) + paddw xmm7,xmm6 + paddw xmm5,xmm4 + psrlw xmm7,4 ; xmm7=Out1LO=( 1 3 5 7 9 11 13 15) + psrlw xmm5,4 ; xmm5=Out1HO=(17 19 21 23 25 27 29 31) + + psllw xmm7,BYTE_BIT + psllw xmm5,BYTE_BIT + por xmm1,xmm7 ; xmm1=Out1L=( 0 1 2 ... 13 14 15) + por xmm0,xmm5 ; xmm0=Out1H=(16 17 18 ... 29 30 31) + + movdqa XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm1 + movdqa XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm0 + + sub rax, byte SIZEOF_XMMWORD + add rcx, byte 1*SIZEOF_XMMWORD ; inptr1(above) + add rbx, byte 1*SIZEOF_XMMWORD ; inptr0 + add rsi, byte 1*SIZEOF_XMMWORD ; inptr1(below) + add rdx, byte 2*SIZEOF_XMMWORD ; outptr0 + add rdi, byte 2*SIZEOF_XMMWORD ; outptr1 + cmp rax, byte SIZEOF_XMMWORD + ja near .columnloop + test rax,rax + jnz near .columnloop_last + + pop rsi + pop rdi + pop rcx + pop rax + + add rsi, byte 1*SIZEOF_JSAMPROW ; input_data + add rdi, byte 2*SIZEOF_JSAMPROW ; output_data + sub rcx, byte 2 ; rowctr + jg near .rowloop + +.return: + pop rbx + uncollect_args + mov rsp,rbp ; rsp <- aligned rbp + pop rsp ; rsp <- original rbp + pop rbp + ret + +; -------------------------------------------------------------------------- +; +; Fast processing for the common case of 2:1 horizontal and 1:1 vertical. +; It's still a box filter. +; +; GLOBAL(void) +; jsimd_h2v1_upsample_sse2 (int max_v_samp_factor, +; JDIMENSION output_width, +; JSAMPARRAY input_data, +; JSAMPARRAY *output_data_ptr); +; + +; r10 = int max_v_samp_factor +; r11 = JDIMENSION output_width +; r12 = JSAMPARRAY input_data +; r13 = JSAMPARRAY *output_data_ptr + + align 16 + global EXTN(jsimd_h2v1_upsample_sse2) + +EXTN(jsimd_h2v1_upsample_sse2): + push rbp + mov rax,rsp + mov rbp,rsp + collect_args + + mov edx, r11d + add rdx, byte (2*SIZEOF_XMMWORD)-1 + and rdx, byte -(2*SIZEOF_XMMWORD) + jz near .return + + mov rcx, r10 ; rowctr + test rcx,rcx + jz short .return + + mov rsi, r12 ; input_data + mov rdi, r13 + mov rdi, JSAMPARRAY [rdi] ; output_data +.rowloop: + push rdi + push rsi + + mov rsi, JSAMPROW [rsi] ; inptr + mov rdi, JSAMPROW [rdi] ; outptr + mov rax,rdx ; colctr +.columnloop: + + movdqa xmm0, XMMWORD [rsi+0*SIZEOF_XMMWORD] + + movdqa xmm1,xmm0 + punpcklbw xmm0,xmm0 + punpckhbw xmm1,xmm1 + + movdqa XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm0 + movdqa XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm1 + + sub rax, byte 2*SIZEOF_XMMWORD + jz short .nextrow + + movdqa xmm2, XMMWORD [rsi+1*SIZEOF_XMMWORD] + + movdqa xmm3,xmm2 + punpcklbw xmm2,xmm2 + punpckhbw xmm3,xmm3 + + movdqa XMMWORD [rdi+2*SIZEOF_XMMWORD], xmm2 + movdqa XMMWORD [rdi+3*SIZEOF_XMMWORD], xmm3 + + sub rax, byte 2*SIZEOF_XMMWORD + jz short .nextrow + + add rsi, byte 2*SIZEOF_XMMWORD ; inptr + add rdi, byte 4*SIZEOF_XMMWORD ; outptr + jmp short .columnloop + +.nextrow: + pop rsi + pop rdi + + add rsi, byte SIZEOF_JSAMPROW ; input_data + add rdi, byte SIZEOF_JSAMPROW ; output_data + dec rcx ; rowctr + jg short .rowloop + +.return: + uncollect_args + pop rbp + ret + +; -------------------------------------------------------------------------- +; +; Fast processing for the common case of 2:1 horizontal and 2:1 vertical. +; It's still a box filter. +; +; GLOBAL(void) +; jsimd_h2v2_upsample_sse2 (nt max_v_samp_factor, +; JDIMENSION output_width, +; JSAMPARRAY input_data, +; JSAMPARRAY *output_data_ptr); +; + +; r10 = int max_v_samp_factor +; r11 = JDIMENSION output_width +; r12 = JSAMPARRAY input_data +; r13 = JSAMPARRAY *output_data_ptr + + align 16 + global EXTN(jsimd_h2v2_upsample_sse2) + +EXTN(jsimd_h2v2_upsample_sse2): + push rbp + mov rax,rsp + mov rbp,rsp + collect_args + push rbx + + mov edx, r11d + add rdx, byte (2*SIZEOF_XMMWORD)-1 + and rdx, byte -(2*SIZEOF_XMMWORD) + jz near .return + + mov rcx, r10 ; rowctr + test rcx,rcx + jz near .return + + mov rsi, r12 ; input_data + mov rdi, r13 + mov rdi, JSAMPARRAY [rdi] ; output_data +.rowloop: + push rdi + push rsi + + mov rsi, JSAMPROW [rsi] ; inptr + mov rbx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW] ; outptr0 + mov rdi, JSAMPROW [rdi+1*SIZEOF_JSAMPROW] ; outptr1 + mov rax,rdx ; colctr +.columnloop: + + movdqa xmm0, XMMWORD [rsi+0*SIZEOF_XMMWORD] + + movdqa xmm1,xmm0 + punpcklbw xmm0,xmm0 + punpckhbw xmm1,xmm1 + + movdqa XMMWORD [rbx+0*SIZEOF_XMMWORD], xmm0 + movdqa XMMWORD [rbx+1*SIZEOF_XMMWORD], xmm1 + movdqa XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm0 + movdqa XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm1 + + sub rax, byte 2*SIZEOF_XMMWORD + jz short .nextrow + + movdqa xmm2, XMMWORD [rsi+1*SIZEOF_XMMWORD] + + movdqa xmm3,xmm2 + punpcklbw xmm2,xmm2 + punpckhbw xmm3,xmm3 + + movdqa XMMWORD [rbx+2*SIZEOF_XMMWORD], xmm2 + movdqa XMMWORD [rbx+3*SIZEOF_XMMWORD], xmm3 + movdqa XMMWORD [rdi+2*SIZEOF_XMMWORD], xmm2 + movdqa XMMWORD [rdi+3*SIZEOF_XMMWORD], xmm3 + + sub rax, byte 2*SIZEOF_XMMWORD + jz short .nextrow + + add rsi, byte 2*SIZEOF_XMMWORD ; inptr + add rbx, byte 4*SIZEOF_XMMWORD ; outptr0 + add rdi, byte 4*SIZEOF_XMMWORD ; outptr1 + jmp short .columnloop + +.nextrow: + pop rsi + pop rdi + + add rsi, byte 1*SIZEOF_JSAMPROW ; input_data + add rdi, byte 2*SIZEOF_JSAMPROW ; output_data + sub rcx, byte 2 ; rowctr + jg near .rowloop + +.return: + pop rbx + uncollect_args + pop rbp + ret + +; For some reason, the OS X linker does not honor the request to align the +; segment unless we do this. + align 16 diff --git a/media/libjpeg/simd/jdsample-sse2.asm b/media/libjpeg/simd/jdsample-sse2.asm new file mode 100644 index 000000000..1d0059e80 --- /dev/null +++ b/media/libjpeg/simd/jdsample-sse2.asm @@ -0,0 +1,728 @@ +; +; jdsample.asm - upsampling (SSE2) +; +; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB +; +; Based on the x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler). +; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 +; +; [TAB8] + +%include "jsimdext.inc" + +; -------------------------------------------------------------------------- + SECTION SEG_CONST + + alignz 16 + global EXTN(jconst_fancy_upsample_sse2) + +EXTN(jconst_fancy_upsample_sse2): + +PW_ONE times 8 dw 1 +PW_TWO times 8 dw 2 +PW_THREE times 8 dw 3 +PW_SEVEN times 8 dw 7 +PW_EIGHT times 8 dw 8 + + alignz 16 + +; -------------------------------------------------------------------------- + SECTION SEG_TEXT + BITS 32 +; +; Fancy processing for the common case of 2:1 horizontal and 1:1 vertical. +; +; The upsampling algorithm is linear interpolation between pixel centers, +; also known as a "triangle filter". This is a good compromise between +; speed and visual quality. The centers of the output pixels are 1/4 and 3/4 +; of the way between input pixel centers. +; +; GLOBAL(void) +; jsimd_h2v1_fancy_upsample_sse2 (int max_v_samp_factor, +; JDIMENSION downsampled_width, +; JSAMPARRAY input_data, +; JSAMPARRAY *output_data_ptr); +; + +%define max_v_samp(b) (b)+8 ; int max_v_samp_factor +%define downsamp_width(b) (b)+12 ; JDIMENSION downsampled_width +%define input_data(b) (b)+16 ; JSAMPARRAY input_data +%define output_data_ptr(b) (b)+20 ; JSAMPARRAY *output_data_ptr + + align 16 + global EXTN(jsimd_h2v1_fancy_upsample_sse2) + +EXTN(jsimd_h2v1_fancy_upsample_sse2): + push ebp + mov ebp,esp + pushpic ebx +; push ecx ; need not be preserved +; push edx ; need not be preserved + push esi + push edi + + get_GOT ebx ; get GOT address + + mov eax, JDIMENSION [downsamp_width(ebp)] ; colctr + test eax,eax + jz near .return + + mov ecx, INT [max_v_samp(ebp)] ; rowctr + test ecx,ecx + jz near .return + + mov esi, JSAMPARRAY [input_data(ebp)] ; input_data + mov edi, POINTER [output_data_ptr(ebp)] + mov edi, JSAMPARRAY [edi] ; output_data + alignx 16,7 +.rowloop: + push eax ; colctr + push edi + push esi + + mov esi, JSAMPROW [esi] ; inptr + mov edi, JSAMPROW [edi] ; outptr + + test eax, SIZEOF_XMMWORD-1 + jz short .skip + mov dl, JSAMPLE [esi+(eax-1)*SIZEOF_JSAMPLE] + mov JSAMPLE [esi+eax*SIZEOF_JSAMPLE], dl ; insert a dummy sample +.skip: + pxor xmm0,xmm0 ; xmm0=(all 0's) + pcmpeqb xmm7,xmm7 + psrldq xmm7,(SIZEOF_XMMWORD-1) + pand xmm7, XMMWORD [esi+0*SIZEOF_XMMWORD] + + add eax, byte SIZEOF_XMMWORD-1 + and eax, byte -SIZEOF_XMMWORD + cmp eax, byte SIZEOF_XMMWORD + ja short .columnloop + alignx 16,7 + +.columnloop_last: + pcmpeqb xmm6,xmm6 + pslldq xmm6,(SIZEOF_XMMWORD-1) + pand xmm6, XMMWORD [esi+0*SIZEOF_XMMWORD] + jmp short .upsample + alignx 16,7 + +.columnloop: + movdqa xmm6, XMMWORD [esi+1*SIZEOF_XMMWORD] + pslldq xmm6,(SIZEOF_XMMWORD-1) + +.upsample: + movdqa xmm1, XMMWORD [esi+0*SIZEOF_XMMWORD] + movdqa xmm2,xmm1 + movdqa xmm3,xmm1 ; xmm1=( 0 1 2 ... 13 14 15) + pslldq xmm2,1 ; xmm2=(-- 0 1 ... 12 13 14) + psrldq xmm3,1 ; xmm3=( 1 2 3 ... 14 15 --) + + por xmm2,xmm7 ; xmm2=(-1 0 1 ... 12 13 14) + por xmm3,xmm6 ; xmm3=( 1 2 3 ... 14 15 16) + + movdqa xmm7,xmm1 + psrldq xmm7,(SIZEOF_XMMWORD-1) ; xmm7=(15 -- -- ... -- -- --) + + movdqa xmm4,xmm1 + punpcklbw xmm1,xmm0 ; xmm1=( 0 1 2 3 4 5 6 7) + punpckhbw xmm4,xmm0 ; xmm4=( 8 9 10 11 12 13 14 15) + movdqa xmm5,xmm2 + punpcklbw xmm2,xmm0 ; xmm2=(-1 0 1 2 3 4 5 6) + punpckhbw xmm5,xmm0 ; xmm5=( 7 8 9 10 11 12 13 14) + movdqa xmm6,xmm3 + punpcklbw xmm3,xmm0 ; xmm3=( 1 2 3 4 5 6 7 8) + punpckhbw xmm6,xmm0 ; xmm6=( 9 10 11 12 13 14 15 16) + + pmullw xmm1,[GOTOFF(ebx,PW_THREE)] + pmullw xmm4,[GOTOFF(ebx,PW_THREE)] + paddw xmm2,[GOTOFF(ebx,PW_ONE)] + paddw xmm5,[GOTOFF(ebx,PW_ONE)] + paddw xmm3,[GOTOFF(ebx,PW_TWO)] + paddw xmm6,[GOTOFF(ebx,PW_TWO)] + + paddw xmm2,xmm1 + paddw xmm5,xmm4 + psrlw xmm2,2 ; xmm2=OutLE=( 0 2 4 6 8 10 12 14) + psrlw xmm5,2 ; xmm5=OutHE=(16 18 20 22 24 26 28 30) + paddw xmm3,xmm1 + paddw xmm6,xmm4 + psrlw xmm3,2 ; xmm3=OutLO=( 1 3 5 7 9 11 13 15) + psrlw xmm6,2 ; xmm6=OutHO=(17 19 21 23 25 27 29 31) + + psllw xmm3,BYTE_BIT + psllw xmm6,BYTE_BIT + por xmm2,xmm3 ; xmm2=OutL=( 0 1 2 ... 13 14 15) + por xmm5,xmm6 ; xmm5=OutH=(16 17 18 ... 29 30 31) + + movdqa XMMWORD [edi+0*SIZEOF_XMMWORD], xmm2 + movdqa XMMWORD [edi+1*SIZEOF_XMMWORD], xmm5 + + sub eax, byte SIZEOF_XMMWORD + add esi, byte 1*SIZEOF_XMMWORD ; inptr + add edi, byte 2*SIZEOF_XMMWORD ; outptr + cmp eax, byte SIZEOF_XMMWORD + ja near .columnloop + test eax,eax + jnz near .columnloop_last + + pop esi + pop edi + pop eax + + add esi, byte SIZEOF_JSAMPROW ; input_data + add edi, byte SIZEOF_JSAMPROW ; output_data + dec ecx ; rowctr + jg near .rowloop + +.return: + pop edi + pop esi +; pop edx ; need not be preserved +; pop ecx ; need not be preserved + poppic ebx + pop ebp + ret + +; -------------------------------------------------------------------------- +; +; Fancy processing for the common case of 2:1 horizontal and 2:1 vertical. +; Again a triangle filter; see comments for h2v1 case, above. +; +; GLOBAL(void) +; jsimd_h2v2_fancy_upsample_sse2 (int max_v_samp_factor, +; JDIMENSION downsampled_width, +; JSAMPARRAY input_data, +; JSAMPARRAY *output_data_ptr); +; + +%define max_v_samp(b) (b)+8 ; int max_v_samp_factor +%define downsamp_width(b) (b)+12 ; JDIMENSION downsampled_width +%define input_data(b) (b)+16 ; JSAMPARRAY input_data +%define output_data_ptr(b) (b)+20 ; JSAMPARRAY *output_data_ptr + +%define original_ebp ebp+0 +%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM] +%define WK_NUM 4 +%define gotptr wk(0)-SIZEOF_POINTER ; void *gotptr + + align 16 + global EXTN(jsimd_h2v2_fancy_upsample_sse2) + +EXTN(jsimd_h2v2_fancy_upsample_sse2): + push ebp + mov eax,esp ; eax = original ebp + sub esp, byte 4 + and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits + mov [esp],eax + mov ebp,esp ; ebp = aligned ebp + lea esp, [wk(0)] + pushpic eax ; make a room for GOT address + push ebx +; push ecx ; need not be preserved +; push edx ; need not be preserved + push esi + push edi + + get_GOT ebx ; get GOT address + movpic POINTER [gotptr], ebx ; save GOT address + + mov edx,eax ; edx = original ebp + mov eax, JDIMENSION [downsamp_width(edx)] ; colctr + test eax,eax + jz near .return + + mov ecx, INT [max_v_samp(edx)] ; rowctr + test ecx,ecx + jz near .return + + mov esi, JSAMPARRAY [input_data(edx)] ; input_data + mov edi, POINTER [output_data_ptr(edx)] + mov edi, JSAMPARRAY [edi] ; output_data + alignx 16,7 +.rowloop: + push eax ; colctr + push ecx + push edi + push esi + + mov ecx, JSAMPROW [esi-1*SIZEOF_JSAMPROW] ; inptr1(above) + mov ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW] ; inptr0 + mov esi, JSAMPROW [esi+1*SIZEOF_JSAMPROW] ; inptr1(below) + mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW] ; outptr0 + mov edi, JSAMPROW [edi+1*SIZEOF_JSAMPROW] ; outptr1 + + test eax, SIZEOF_XMMWORD-1 + jz short .skip + push edx + mov dl, JSAMPLE [ecx+(eax-1)*SIZEOF_JSAMPLE] + mov JSAMPLE [ecx+eax*SIZEOF_JSAMPLE], dl + mov dl, JSAMPLE [ebx+(eax-1)*SIZEOF_JSAMPLE] + mov JSAMPLE [ebx+eax*SIZEOF_JSAMPLE], dl + mov dl, JSAMPLE [esi+(eax-1)*SIZEOF_JSAMPLE] + mov JSAMPLE [esi+eax*SIZEOF_JSAMPLE], dl ; insert a dummy sample + pop edx +.skip: + ; -- process the first column block + + movdqa xmm0, XMMWORD [ebx+0*SIZEOF_XMMWORD] ; xmm0=row[ 0][0] + movdqa xmm1, XMMWORD [ecx+0*SIZEOF_XMMWORD] ; xmm1=row[-1][0] + movdqa xmm2, XMMWORD [esi+0*SIZEOF_XMMWORD] ; xmm2=row[+1][0] + + pushpic ebx + movpic ebx, POINTER [gotptr] ; load GOT address + + pxor xmm3,xmm3 ; xmm3=(all 0's) + movdqa xmm4,xmm0 + punpcklbw xmm0,xmm3 ; xmm0=row[ 0]( 0 1 2 3 4 5 6 7) + punpckhbw xmm4,xmm3 ; xmm4=row[ 0]( 8 9 10 11 12 13 14 15) + movdqa xmm5,xmm1 + punpcklbw xmm1,xmm3 ; xmm1=row[-1]( 0 1 2 3 4 5 6 7) + punpckhbw xmm5,xmm3 ; xmm5=row[-1]( 8 9 10 11 12 13 14 15) + movdqa xmm6,xmm2 + punpcklbw xmm2,xmm3 ; xmm2=row[+1]( 0 1 2 3 4 5 6 7) + punpckhbw xmm6,xmm3 ; xmm6=row[+1]( 8 9 10 11 12 13 14 15) + + pmullw xmm0,[GOTOFF(ebx,PW_THREE)] + pmullw xmm4,[GOTOFF(ebx,PW_THREE)] + + pcmpeqb xmm7,xmm7 + psrldq xmm7,(SIZEOF_XMMWORD-2) + + paddw xmm1,xmm0 ; xmm1=Int0L=( 0 1 2 3 4 5 6 7) + paddw xmm5,xmm4 ; xmm5=Int0H=( 8 9 10 11 12 13 14 15) + paddw xmm2,xmm0 ; xmm2=Int1L=( 0 1 2 3 4 5 6 7) + paddw xmm6,xmm4 ; xmm6=Int1H=( 8 9 10 11 12 13 14 15) + + movdqa XMMWORD [edx+0*SIZEOF_XMMWORD], xmm1 ; temporarily save + movdqa XMMWORD [edx+1*SIZEOF_XMMWORD], xmm5 ; the intermediate data + movdqa XMMWORD [edi+0*SIZEOF_XMMWORD], xmm2 + movdqa XMMWORD [edi+1*SIZEOF_XMMWORD], xmm6 + + pand xmm1,xmm7 ; xmm1=( 0 -- -- -- -- -- -- --) + pand xmm2,xmm7 ; xmm2=( 0 -- -- -- -- -- -- --) + + movdqa XMMWORD [wk(0)], xmm1 + movdqa XMMWORD [wk(1)], xmm2 + + poppic ebx + + add eax, byte SIZEOF_XMMWORD-1 + and eax, byte -SIZEOF_XMMWORD + cmp eax, byte SIZEOF_XMMWORD + ja short .columnloop + alignx 16,7 + +.columnloop_last: + ; -- process the last column block + + pushpic ebx + movpic ebx, POINTER [gotptr] ; load GOT address + + pcmpeqb xmm1,xmm1 + pslldq xmm1,(SIZEOF_XMMWORD-2) + movdqa xmm2,xmm1 + + pand xmm1, XMMWORD [edx+1*SIZEOF_XMMWORD] + pand xmm2, XMMWORD [edi+1*SIZEOF_XMMWORD] + + movdqa XMMWORD [wk(2)], xmm1 ; xmm1=(-- -- -- -- -- -- -- 15) + movdqa XMMWORD [wk(3)], xmm2 ; xmm2=(-- -- -- -- -- -- -- 15) + + jmp near .upsample + alignx 16,7 + +.columnloop: + ; -- process the next column block + + movdqa xmm0, XMMWORD [ebx+1*SIZEOF_XMMWORD] ; xmm0=row[ 0][1] + movdqa xmm1, XMMWORD [ecx+1*SIZEOF_XMMWORD] ; xmm1=row[-1][1] + movdqa xmm2, XMMWORD [esi+1*SIZEOF_XMMWORD] ; xmm2=row[+1][1] + + pushpic ebx + movpic ebx, POINTER [gotptr] ; load GOT address + + pxor xmm3,xmm3 ; xmm3=(all 0's) + movdqa xmm4,xmm0 + punpcklbw xmm0,xmm3 ; xmm0=row[ 0]( 0 1 2 3 4 5 6 7) + punpckhbw xmm4,xmm3 ; xmm4=row[ 0]( 8 9 10 11 12 13 14 15) + movdqa xmm5,xmm1 + punpcklbw xmm1,xmm3 ; xmm1=row[-1]( 0 1 2 3 4 5 6 7) + punpckhbw xmm5,xmm3 ; xmm5=row[-1]( 8 9 10 11 12 13 14 15) + movdqa xmm6,xmm2 + punpcklbw xmm2,xmm3 ; xmm2=row[+1]( 0 1 2 3 4 5 6 7) + punpckhbw xmm6,xmm3 ; xmm6=row[+1]( 8 9 10 11 12 13 14 15) + + pmullw xmm0,[GOTOFF(ebx,PW_THREE)] + pmullw xmm4,[GOTOFF(ebx,PW_THREE)] + + paddw xmm1,xmm0 ; xmm1=Int0L=( 0 1 2 3 4 5 6 7) + paddw xmm5,xmm4 ; xmm5=Int0H=( 8 9 10 11 12 13 14 15) + paddw xmm2,xmm0 ; xmm2=Int1L=( 0 1 2 3 4 5 6 7) + paddw xmm6,xmm4 ; xmm6=Int1H=( 8 9 10 11 12 13 14 15) + + movdqa XMMWORD [edx+2*SIZEOF_XMMWORD], xmm1 ; temporarily save + movdqa XMMWORD [edx+3*SIZEOF_XMMWORD], xmm5 ; the intermediate data + movdqa XMMWORD [edi+2*SIZEOF_XMMWORD], xmm2 + movdqa XMMWORD [edi+3*SIZEOF_XMMWORD], xmm6 + + pslldq xmm1,(SIZEOF_XMMWORD-2) ; xmm1=(-- -- -- -- -- -- -- 0) + pslldq xmm2,(SIZEOF_XMMWORD-2) ; xmm2=(-- -- -- -- -- -- -- 0) + + movdqa XMMWORD [wk(2)], xmm1 + movdqa XMMWORD [wk(3)], xmm2 + +.upsample: + ; -- process the upper row + + movdqa xmm7, XMMWORD [edx+0*SIZEOF_XMMWORD] + movdqa xmm3, XMMWORD [edx+1*SIZEOF_XMMWORD] + + movdqa xmm0,xmm7 ; xmm7=Int0L=( 0 1 2 3 4 5 6 7) + movdqa xmm4,xmm3 ; xmm3=Int0H=( 8 9 10 11 12 13 14 15) + psrldq xmm0,2 ; xmm0=( 1 2 3 4 5 6 7 --) + pslldq xmm4,(SIZEOF_XMMWORD-2) ; xmm4=(-- -- -- -- -- -- -- 8) + movdqa xmm5,xmm7 + movdqa xmm6,xmm3 + psrldq xmm5,(SIZEOF_XMMWORD-2) ; xmm5=( 7 -- -- -- -- -- -- --) + pslldq xmm6,2 ; xmm6=(-- 8 9 10 11 12 13 14) + + por xmm0,xmm4 ; xmm0=( 1 2 3 4 5 6 7 8) + por xmm5,xmm6 ; xmm5=( 7 8 9 10 11 12 13 14) + + movdqa xmm1,xmm7 + movdqa xmm2,xmm3 + pslldq xmm1,2 ; xmm1=(-- 0 1 2 3 4 5 6) + psrldq xmm2,2 ; xmm2=( 9 10 11 12 13 14 15 --) + movdqa xmm4,xmm3 + psrldq xmm4,(SIZEOF_XMMWORD-2) ; xmm4=(15 -- -- -- -- -- -- --) + + por xmm1, XMMWORD [wk(0)] ; xmm1=(-1 0 1 2 3 4 5 6) + por xmm2, XMMWORD [wk(2)] ; xmm2=( 9 10 11 12 13 14 15 16) + + movdqa XMMWORD [wk(0)], xmm4 + + pmullw xmm7,[GOTOFF(ebx,PW_THREE)] + pmullw xmm3,[GOTOFF(ebx,PW_THREE)] + paddw xmm1,[GOTOFF(ebx,PW_EIGHT)] + paddw xmm5,[GOTOFF(ebx,PW_EIGHT)] + paddw xmm0,[GOTOFF(ebx,PW_SEVEN)] + paddw xmm2,[GOTOFF(ebx,PW_SEVEN)] + + paddw xmm1,xmm7 + paddw xmm5,xmm3 + psrlw xmm1,4 ; xmm1=Out0LE=( 0 2 4 6 8 10 12 14) + psrlw xmm5,4 ; xmm5=Out0HE=(16 18 20 22 24 26 28 30) + paddw xmm0,xmm7 + paddw xmm2,xmm3 + psrlw xmm0,4 ; xmm0=Out0LO=( 1 3 5 7 9 11 13 15) + psrlw xmm2,4 ; xmm2=Out0HO=(17 19 21 23 25 27 29 31) + + psllw xmm0,BYTE_BIT + psllw xmm2,BYTE_BIT + por xmm1,xmm0 ; xmm1=Out0L=( 0 1 2 ... 13 14 15) + por xmm5,xmm2 ; xmm5=Out0H=(16 17 18 ... 29 30 31) + + movdqa XMMWORD [edx+0*SIZEOF_XMMWORD], xmm1 + movdqa XMMWORD [edx+1*SIZEOF_XMMWORD], xmm5 + + ; -- process the lower row + + movdqa xmm6, XMMWORD [edi+0*SIZEOF_XMMWORD] + movdqa xmm4, XMMWORD [edi+1*SIZEOF_XMMWORD] + + movdqa xmm7,xmm6 ; xmm6=Int1L=( 0 1 2 3 4 5 6 7) + movdqa xmm3,xmm4 ; xmm4=Int1H=( 8 9 10 11 12 13 14 15) + psrldq xmm7,2 ; xmm7=( 1 2 3 4 5 6 7 --) + pslldq xmm3,(SIZEOF_XMMWORD-2) ; xmm3=(-- -- -- -- -- -- -- 8) + movdqa xmm0,xmm6 + movdqa xmm2,xmm4 + psrldq xmm0,(SIZEOF_XMMWORD-2) ; xmm0=( 7 -- -- -- -- -- -- --) + pslldq xmm2,2 ; xmm2=(-- 8 9 10 11 12 13 14) + + por xmm7,xmm3 ; xmm7=( 1 2 3 4 5 6 7 8) + por xmm0,xmm2 ; xmm0=( 7 8 9 10 11 12 13 14) + + movdqa xmm1,xmm6 + movdqa xmm5,xmm4 + pslldq xmm1,2 ; xmm1=(-- 0 1 2 3 4 5 6) + psrldq xmm5,2 ; xmm5=( 9 10 11 12 13 14 15 --) + movdqa xmm3,xmm4 + psrldq xmm3,(SIZEOF_XMMWORD-2) ; xmm3=(15 -- -- -- -- -- -- --) + + por xmm1, XMMWORD [wk(1)] ; xmm1=(-1 0 1 2 3 4 5 6) + por xmm5, XMMWORD [wk(3)] ; xmm5=( 9 10 11 12 13 14 15 16) + + movdqa XMMWORD [wk(1)], xmm3 + + pmullw xmm6,[GOTOFF(ebx,PW_THREE)] + pmullw xmm4,[GOTOFF(ebx,PW_THREE)] + paddw xmm1,[GOTOFF(ebx,PW_EIGHT)] + paddw xmm0,[GOTOFF(ebx,PW_EIGHT)] + paddw xmm7,[GOTOFF(ebx,PW_SEVEN)] + paddw xmm5,[GOTOFF(ebx,PW_SEVEN)] + + paddw xmm1,xmm6 + paddw xmm0,xmm4 + psrlw xmm1,4 ; xmm1=Out1LE=( 0 2 4 6 8 10 12 14) + psrlw xmm0,4 ; xmm0=Out1HE=(16 18 20 22 24 26 28 30) + paddw xmm7,xmm6 + paddw xmm5,xmm4 + psrlw xmm7,4 ; xmm7=Out1LO=( 1 3 5 7 9 11 13 15) + psrlw xmm5,4 ; xmm5=Out1HO=(17 19 21 23 25 27 29 31) + + psllw xmm7,BYTE_BIT + psllw xmm5,BYTE_BIT + por xmm1,xmm7 ; xmm1=Out1L=( 0 1 2 ... 13 14 15) + por xmm0,xmm5 ; xmm0=Out1H=(16 17 18 ... 29 30 31) + + movdqa XMMWORD [edi+0*SIZEOF_XMMWORD], xmm1 + movdqa XMMWORD [edi+1*SIZEOF_XMMWORD], xmm0 + + poppic ebx + + sub eax, byte SIZEOF_XMMWORD + add ecx, byte 1*SIZEOF_XMMWORD ; inptr1(above) + add ebx, byte 1*SIZEOF_XMMWORD ; inptr0 + add esi, byte 1*SIZEOF_XMMWORD ; inptr1(below) + add edx, byte 2*SIZEOF_XMMWORD ; outptr0 + add edi, byte 2*SIZEOF_XMMWORD ; outptr1 + cmp eax, byte SIZEOF_XMMWORD + ja near .columnloop + test eax,eax + jnz near .columnloop_last + + pop esi + pop edi + pop ecx + pop eax + + add esi, byte 1*SIZEOF_JSAMPROW ; input_data + add edi, byte 2*SIZEOF_JSAMPROW ; output_data + sub ecx, byte 2 ; rowctr + jg near .rowloop + +.return: + pop edi + pop esi +; pop edx ; need not be preserved +; pop ecx ; need not be preserved + pop ebx + mov esp,ebp ; esp <- aligned ebp + pop esp ; esp <- original ebp + pop ebp + ret + +; -------------------------------------------------------------------------- +; +; Fast processing for the common case of 2:1 horizontal and 1:1 vertical. +; It's still a box filter. +; +; GLOBAL(void) +; jsimd_h2v1_upsample_sse2 (int max_v_samp_factor, +; JDIMENSION output_width, +; JSAMPARRAY input_data, +; JSAMPARRAY *output_data_ptr); +; + +%define max_v_samp(b) (b)+8 ; int max_v_samp_factor +%define output_width(b) (b)+12 ; JDIMENSION output_width +%define input_data(b) (b)+16 ; JSAMPARRAY input_data +%define output_data_ptr(b) (b)+20 ; JSAMPARRAY *output_data_ptr + + align 16 + global EXTN(jsimd_h2v1_upsample_sse2) + +EXTN(jsimd_h2v1_upsample_sse2): + push ebp + mov ebp,esp +; push ebx ; unused +; push ecx ; need not be preserved +; push edx ; need not be preserved + push esi + push edi + + mov edx, JDIMENSION [output_width(ebp)] + add edx, byte (2*SIZEOF_XMMWORD)-1 + and edx, byte -(2*SIZEOF_XMMWORD) + jz short .return + + mov ecx, INT [max_v_samp(ebp)] ; rowctr + test ecx,ecx + jz short .return + + mov esi, JSAMPARRAY [input_data(ebp)] ; input_data + mov edi, POINTER [output_data_ptr(ebp)] + mov edi, JSAMPARRAY [edi] ; output_data + alignx 16,7 +.rowloop: + push edi + push esi + + mov esi, JSAMPROW [esi] ; inptr + mov edi, JSAMPROW [edi] ; outptr + mov eax,edx ; colctr + alignx 16,7 +.columnloop: + + movdqa xmm0, XMMWORD [esi+0*SIZEOF_XMMWORD] + + movdqa xmm1,xmm0 + punpcklbw xmm0,xmm0 + punpckhbw xmm1,xmm1 + + movdqa XMMWORD [edi+0*SIZEOF_XMMWORD], xmm0 + movdqa XMMWORD [edi+1*SIZEOF_XMMWORD], xmm1 + + sub eax, byte 2*SIZEOF_XMMWORD + jz short .nextrow + + movdqa xmm2, XMMWORD [esi+1*SIZEOF_XMMWORD] + + movdqa xmm3,xmm2 + punpcklbw xmm2,xmm2 + punpckhbw xmm3,xmm3 + + movdqa XMMWORD [edi+2*SIZEOF_XMMWORD], xmm2 + movdqa XMMWORD [edi+3*SIZEOF_XMMWORD], xmm3 + + sub eax, byte 2*SIZEOF_XMMWORD + jz short .nextrow + + add esi, byte 2*SIZEOF_XMMWORD ; inptr + add edi, byte 4*SIZEOF_XMMWORD ; outptr + jmp short .columnloop + alignx 16,7 + +.nextrow: + pop esi + pop edi + + add esi, byte SIZEOF_JSAMPROW ; input_data + add edi, byte SIZEOF_JSAMPROW ; output_data + dec ecx ; rowctr + jg short .rowloop + +.return: + pop edi + pop esi +; pop edx ; need not be preserved +; pop ecx ; need not be preserved +; pop ebx ; unused + pop ebp + ret + +; -------------------------------------------------------------------------- +; +; Fast processing for the common case of 2:1 horizontal and 2:1 vertical. +; It's still a box filter. +; +; GLOBAL(void) +; jsimd_h2v2_upsample_sse2 (nt max_v_samp_factor, +; JDIMENSION output_width, +; JSAMPARRAY input_data, +; JSAMPARRAY *output_data_ptr); +; + +%define max_v_samp(b) (b)+8 ; int max_v_samp_factor +%define output_width(b) (b)+12 ; JDIMENSION output_width +%define input_data(b) (b)+16 ; JSAMPARRAY input_data +%define output_data_ptr(b) (b)+20 ; JSAMPARRAY *output_data_ptr + + align 16 + global EXTN(jsimd_h2v2_upsample_sse2) + +EXTN(jsimd_h2v2_upsample_sse2): + push ebp + mov ebp,esp + push ebx +; push ecx ; need not be preserved +; push edx ; need not be preserved + push esi + push edi + + mov edx, JDIMENSION [output_width(ebp)] + add edx, byte (2*SIZEOF_XMMWORD)-1 + and edx, byte -(2*SIZEOF_XMMWORD) + jz near .return + + mov ecx, INT [max_v_samp(ebp)] ; rowctr + test ecx,ecx + jz near .return + + mov esi, JSAMPARRAY [input_data(ebp)] ; input_data + mov edi, POINTER [output_data_ptr(ebp)] + mov edi, JSAMPARRAY [edi] ; output_data + alignx 16,7 +.rowloop: + push edi + push esi + + mov esi, JSAMPROW [esi] ; inptr + mov ebx, JSAMPROW [edi+0*SIZEOF_JSAMPROW] ; outptr0 + mov edi, JSAMPROW [edi+1*SIZEOF_JSAMPROW] ; outptr1 + mov eax,edx ; colctr + alignx 16,7 +.columnloop: + + movdqa xmm0, XMMWORD [esi+0*SIZEOF_XMMWORD] + + movdqa xmm1,xmm0 + punpcklbw xmm0,xmm0 + punpckhbw xmm1,xmm1 + + movdqa XMMWORD [ebx+0*SIZEOF_XMMWORD], xmm0 + movdqa XMMWORD [ebx+1*SIZEOF_XMMWORD], xmm1 + movdqa XMMWORD [edi+0*SIZEOF_XMMWORD], xmm0 + movdqa XMMWORD [edi+1*SIZEOF_XMMWORD], xmm1 + + sub eax, byte 2*SIZEOF_XMMWORD + jz short .nextrow + + movdqa xmm2, XMMWORD [esi+1*SIZEOF_XMMWORD] + + movdqa xmm3,xmm2 + punpcklbw xmm2,xmm2 + punpckhbw xmm3,xmm3 + + movdqa XMMWORD [ebx+2*SIZEOF_XMMWORD], xmm2 + movdqa XMMWORD [ebx+3*SIZEOF_XMMWORD], xmm3 + movdqa XMMWORD [edi+2*SIZEOF_XMMWORD], xmm2 + movdqa XMMWORD [edi+3*SIZEOF_XMMWORD], xmm3 + + sub eax, byte 2*SIZEOF_XMMWORD + jz short .nextrow + + add esi, byte 2*SIZEOF_XMMWORD ; inptr + add ebx, byte 4*SIZEOF_XMMWORD ; outptr0 + add edi, byte 4*SIZEOF_XMMWORD ; outptr1 + jmp short .columnloop + alignx 16,7 + +.nextrow: + pop esi + pop edi + + add esi, byte 1*SIZEOF_JSAMPROW ; input_data + add edi, byte 2*SIZEOF_JSAMPROW ; output_data + sub ecx, byte 2 ; rowctr + jg short .rowloop + +.return: + pop edi + pop esi +; pop edx ; need not be preserved +; pop ecx ; need not be preserved + pop ebx + pop ebp + ret + +; For some reason, the OS X linker does not honor the request to align the +; segment unless we do this. + align 16 diff --git a/media/libjpeg/simd/jfdctflt-3dn.asm b/media/libjpeg/simd/jfdctflt-3dn.asm new file mode 100644 index 000000000..219161819 --- /dev/null +++ b/media/libjpeg/simd/jfdctflt-3dn.asm @@ -0,0 +1,319 @@ +; +; jfdctflt.asm - floating-point FDCT (3DNow!) +; +; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB +; +; Based on the x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler). +; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 +; +; This file contains a floating-point implementation of the forward DCT +; (Discrete Cosine Transform). The following code is based directly on +; the IJG's original jfdctflt.c; see the jfdctflt.c for more details. +; +; [TAB8] + +%include "jsimdext.inc" +%include "jdct.inc" + +; -------------------------------------------------------------------------- + SECTION SEG_CONST + + alignz 16 + global EXTN(jconst_fdct_float_3dnow) + +EXTN(jconst_fdct_float_3dnow): + +PD_0_382 times 2 dd 0.382683432365089771728460 +PD_0_707 times 2 dd 0.707106781186547524400844 +PD_0_541 times 2 dd 0.541196100146196984399723 +PD_1_306 times 2 dd 1.306562964876376527856643 + + alignz 16 + +; -------------------------------------------------------------------------- + SECTION SEG_TEXT + BITS 32 +; +; Perform the forward DCT on one block of samples. +; +; GLOBAL(void) +; jsimd_fdct_float_3dnow (FAST_FLOAT *data) +; + +%define data(b) (b)+8 ; FAST_FLOAT *data + +%define original_ebp ebp+0 +%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_MMWORD ; mmword wk[WK_NUM] +%define WK_NUM 2 + + align 16 + global EXTN(jsimd_fdct_float_3dnow) + +EXTN(jsimd_fdct_float_3dnow): + push ebp + mov eax,esp ; eax = original ebp + sub esp, byte 4 + and esp, byte (-SIZEOF_MMWORD) ; align to 64 bits + mov [esp],eax + mov ebp,esp ; ebp = aligned ebp + lea esp, [wk(0)] + pushpic ebx +; push ecx ; need not be preserved +; push edx ; need not be preserved +; push esi ; unused +; push edi ; unused + + get_GOT ebx ; get GOT address + + ; ---- Pass 1: process rows. + + mov edx, POINTER [data(eax)] ; (FAST_FLOAT *) + mov ecx, DCTSIZE/2 + alignx 16,7 +.rowloop: + + movq mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)] + movq mm1, MMWORD [MMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)] + movq mm2, MMWORD [MMBLOCK(0,3,edx,SIZEOF_FAST_FLOAT)] + movq mm3, MMWORD [MMBLOCK(1,3,edx,SIZEOF_FAST_FLOAT)] + + ; mm0=(00 01), mm1=(10 11), mm2=(06 07), mm3=(16 17) + + movq mm4,mm0 ; transpose coefficients + punpckldq mm0,mm1 ; mm0=(00 10)=data0 + punpckhdq mm4,mm1 ; mm4=(01 11)=data1 + movq mm5,mm2 ; transpose coefficients + punpckldq mm2,mm3 ; mm2=(06 16)=data6 + punpckhdq mm5,mm3 ; mm5=(07 17)=data7 + + movq mm6,mm4 + movq mm7,mm0 + pfsub mm4,mm2 ; mm4=data1-data6=tmp6 + pfsub mm0,mm5 ; mm0=data0-data7=tmp7 + pfadd mm6,mm2 ; mm6=data1+data6=tmp1 + pfadd mm7,mm5 ; mm7=data0+data7=tmp0 + + movq mm1, MMWORD [MMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)] + movq mm3, MMWORD [MMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)] + movq mm2, MMWORD [MMBLOCK(0,2,edx,SIZEOF_FAST_FLOAT)] + movq mm5, MMWORD [MMBLOCK(1,2,edx,SIZEOF_FAST_FLOAT)] + + ; mm1=(02 03), mm3=(12 13), mm2=(04 05), mm5=(14 15) + + movq MMWORD [wk(0)], mm4 ; wk(0)=tmp6 + movq MMWORD [wk(1)], mm0 ; wk(1)=tmp7 + + movq mm4,mm1 ; transpose coefficients + punpckldq mm1,mm3 ; mm1=(02 12)=data2 + punpckhdq mm4,mm3 ; mm4=(03 13)=data3 + movq mm0,mm2 ; transpose coefficients + punpckldq mm2,mm5 ; mm2=(04 14)=data4 + punpckhdq mm0,mm5 ; mm0=(05 15)=data5 + + movq mm3,mm4 + movq mm5,mm1 + pfadd mm4,mm2 ; mm4=data3+data4=tmp3 + pfadd mm1,mm0 ; mm1=data2+data5=tmp2 + pfsub mm3,mm2 ; mm3=data3-data4=tmp4 + pfsub mm5,mm0 ; mm5=data2-data5=tmp5 + + ; -- Even part + + movq mm2,mm7 + movq mm0,mm6 + pfsub mm7,mm4 ; mm7=tmp13 + pfsub mm6,mm1 ; mm6=tmp12 + pfadd mm2,mm4 ; mm2=tmp10 + pfadd mm0,mm1 ; mm0=tmp11 + + pfadd mm6,mm7 + pfmul mm6,[GOTOFF(ebx,PD_0_707)] ; mm6=z1 + + movq mm4,mm2 + movq mm1,mm7 + pfsub mm2,mm0 ; mm2=data4 + pfsub mm7,mm6 ; mm7=data6 + pfadd mm4,mm0 ; mm4=data0 + pfadd mm1,mm6 ; mm1=data2 + + movq MMWORD [MMBLOCK(0,2,edx,SIZEOF_FAST_FLOAT)], mm2 + movq MMWORD [MMBLOCK(0,3,edx,SIZEOF_FAST_FLOAT)], mm7 + movq MMWORD [MMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)], mm4 + movq MMWORD [MMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)], mm1 + + ; -- Odd part + + movq mm0, MMWORD [wk(0)] ; mm0=tmp6 + movq mm6, MMWORD [wk(1)] ; mm6=tmp7 + + pfadd mm3,mm5 ; mm3=tmp10 + pfadd mm5,mm0 ; mm5=tmp11 + pfadd mm0,mm6 ; mm0=tmp12, mm6=tmp7 + + pfmul mm5,[GOTOFF(ebx,PD_0_707)] ; mm5=z3 + + movq mm2,mm3 ; mm2=tmp10 + pfsub mm3,mm0 + pfmul mm3,[GOTOFF(ebx,PD_0_382)] ; mm3=z5 + pfmul mm2,[GOTOFF(ebx,PD_0_541)] ; mm2=MULTIPLY(tmp10,FIX_0_54119610) + pfmul mm0,[GOTOFF(ebx,PD_1_306)] ; mm0=MULTIPLY(tmp12,FIX_1_30656296) + pfadd mm2,mm3 ; mm2=z2 + pfadd mm0,mm3 ; mm0=z4 + + movq mm7,mm6 + pfsub mm6,mm5 ; mm6=z13 + pfadd mm7,mm5 ; mm7=z11 + + movq mm4,mm6 + movq mm1,mm7 + pfsub mm6,mm2 ; mm6=data3 + pfsub mm7,mm0 ; mm7=data7 + pfadd mm4,mm2 ; mm4=data5 + pfadd mm1,mm0 ; mm1=data1 + + movq MMWORD [MMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)], mm6 + movq MMWORD [MMBLOCK(1,3,edx,SIZEOF_FAST_FLOAT)], mm7 + movq MMWORD [MMBLOCK(1,2,edx,SIZEOF_FAST_FLOAT)], mm4 + movq MMWORD [MMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)], mm1 + + add edx, byte 2*DCTSIZE*SIZEOF_FAST_FLOAT + dec ecx + jnz near .rowloop + + ; ---- Pass 2: process columns. + + mov edx, POINTER [data(eax)] ; (FAST_FLOAT *) + mov ecx, DCTSIZE/2 + alignx 16,7 +.columnloop: + + movq mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)] + movq mm1, MMWORD [MMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)] + movq mm2, MMWORD [MMBLOCK(6,0,edx,SIZEOF_FAST_FLOAT)] + movq mm3, MMWORD [MMBLOCK(7,0,edx,SIZEOF_FAST_FLOAT)] + + ; mm0=(00 10), mm1=(01 11), mm2=(60 70), mm3=(61 71) + + movq mm4,mm0 ; transpose coefficients + punpckldq mm0,mm1 ; mm0=(00 01)=data0 + punpckhdq mm4,mm1 ; mm4=(10 11)=data1 + movq mm5,mm2 ; transpose coefficients + punpckldq mm2,mm3 ; mm2=(60 61)=data6 + punpckhdq mm5,mm3 ; mm5=(70 71)=data7 + + movq mm6,mm4 + movq mm7,mm0 + pfsub mm4,mm2 ; mm4=data1-data6=tmp6 + pfsub mm0,mm5 ; mm0=data0-data7=tmp7 + pfadd mm6,mm2 ; mm6=data1+data6=tmp1 + pfadd mm7,mm5 ; mm7=data0+data7=tmp0 + + movq mm1, MMWORD [MMBLOCK(2,0,edx,SIZEOF_FAST_FLOAT)] + movq mm3, MMWORD [MMBLOCK(3,0,edx,SIZEOF_FAST_FLOAT)] + movq mm2, MMWORD [MMBLOCK(4,0,edx,SIZEOF_FAST_FLOAT)] + movq mm5, MMWORD [MMBLOCK(5,0,edx,SIZEOF_FAST_FLOAT)] + + ; mm1=(20 30), mm3=(21 31), mm2=(40 50), mm5=(41 51) + + movq MMWORD [wk(0)], mm4 ; wk(0)=tmp6 + movq MMWORD [wk(1)], mm0 ; wk(1)=tmp7 + + movq mm4,mm1 ; transpose coefficients + punpckldq mm1,mm3 ; mm1=(20 21)=data2 + punpckhdq mm4,mm3 ; mm4=(30 31)=data3 + movq mm0,mm2 ; transpose coefficients + punpckldq mm2,mm5 ; mm2=(40 41)=data4 + punpckhdq mm0,mm5 ; mm0=(50 51)=data5 + + movq mm3,mm4 + movq mm5,mm1 + pfadd mm4,mm2 ; mm4=data3+data4=tmp3 + pfadd mm1,mm0 ; mm1=data2+data5=tmp2 + pfsub mm3,mm2 ; mm3=data3-data4=tmp4 + pfsub mm5,mm0 ; mm5=data2-data5=tmp5 + + ; -- Even part + + movq mm2,mm7 + movq mm0,mm6 + pfsub mm7,mm4 ; mm7=tmp13 + pfsub mm6,mm1 ; mm6=tmp12 + pfadd mm2,mm4 ; mm2=tmp10 + pfadd mm0,mm1 ; mm0=tmp11 + + pfadd mm6,mm7 + pfmul mm6,[GOTOFF(ebx,PD_0_707)] ; mm6=z1 + + movq mm4,mm2 + movq mm1,mm7 + pfsub mm2,mm0 ; mm2=data4 + pfsub mm7,mm6 ; mm7=data6 + pfadd mm4,mm0 ; mm4=data0 + pfadd mm1,mm6 ; mm1=data2 + + movq MMWORD [MMBLOCK(4,0,edx,SIZEOF_FAST_FLOAT)], mm2 + movq MMWORD [MMBLOCK(6,0,edx,SIZEOF_FAST_FLOAT)], mm7 + movq MMWORD [MMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)], mm4 + movq MMWORD [MMBLOCK(2,0,edx,SIZEOF_FAST_FLOAT)], mm1 + + ; -- Odd part + + movq mm0, MMWORD [wk(0)] ; mm0=tmp6 + movq mm6, MMWORD [wk(1)] ; mm6=tmp7 + + pfadd mm3,mm5 ; mm3=tmp10 + pfadd mm5,mm0 ; mm5=tmp11 + pfadd mm0,mm6 ; mm0=tmp12, mm6=tmp7 + + pfmul mm5,[GOTOFF(ebx,PD_0_707)] ; mm5=z3 + + movq mm2,mm3 ; mm2=tmp10 + pfsub mm3,mm0 + pfmul mm3,[GOTOFF(ebx,PD_0_382)] ; mm3=z5 + pfmul mm2,[GOTOFF(ebx,PD_0_541)] ; mm2=MULTIPLY(tmp10,FIX_0_54119610) + pfmul mm0,[GOTOFF(ebx,PD_1_306)] ; mm0=MULTIPLY(tmp12,FIX_1_30656296) + pfadd mm2,mm3 ; mm2=z2 + pfadd mm0,mm3 ; mm0=z4 + + movq mm7,mm6 + pfsub mm6,mm5 ; mm6=z13 + pfadd mm7,mm5 ; mm7=z11 + + movq mm4,mm6 + movq mm1,mm7 + pfsub mm6,mm2 ; mm6=data3 + pfsub mm7,mm0 ; mm7=data7 + pfadd mm4,mm2 ; mm4=data5 + pfadd mm1,mm0 ; mm1=data1 + + movq MMWORD [MMBLOCK(3,0,edx,SIZEOF_FAST_FLOAT)], mm6 + movq MMWORD [MMBLOCK(7,0,edx,SIZEOF_FAST_FLOAT)], mm7 + movq MMWORD [MMBLOCK(5,0,edx,SIZEOF_FAST_FLOAT)], mm4 + movq MMWORD [MMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)], mm1 + + add edx, byte 2*SIZEOF_FAST_FLOAT + dec ecx + jnz near .columnloop + + femms ; empty MMX/3DNow! state + +; pop edi ; unused +; pop esi ; unused +; pop edx ; need not be preserved +; pop ecx ; need not be preserved + poppic ebx + mov esp,ebp ; esp <- aligned ebp + pop esp ; esp <- original ebp + pop ebp + ret + +; For some reason, the OS X linker does not honor the request to align the +; segment unless we do this. + align 16 diff --git a/media/libjpeg/simd/jfdctflt-sse-64.asm b/media/libjpeg/simd/jfdctflt-sse-64.asm new file mode 100644 index 000000000..4b64ea4bb --- /dev/null +++ b/media/libjpeg/simd/jfdctflt-sse-64.asm @@ -0,0 +1,357 @@ +; +; jfdctflt.asm - floating-point FDCT (64-bit SSE) +; +; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB +; Copyright (C) 2009, D. R. Commander. +; +; Based on the x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler). +; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 +; +; This file contains a floating-point implementation of the forward DCT +; (Discrete Cosine Transform). The following code is based directly on +; the IJG's original jfdctflt.c; see the jfdctflt.c for more details. +; +; [TAB8] + +%include "jsimdext.inc" +%include "jdct.inc" + +; -------------------------------------------------------------------------- + +%macro unpcklps2 2 ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(0 1 4 5) + shufps %1,%2,0x44 +%endmacro + +%macro unpckhps2 2 ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(2 3 6 7) + shufps %1,%2,0xEE +%endmacro + +; -------------------------------------------------------------------------- + SECTION SEG_CONST + + alignz 16 + global EXTN(jconst_fdct_float_sse) + +EXTN(jconst_fdct_float_sse): + +PD_0_382 times 4 dd 0.382683432365089771728460 +PD_0_707 times 4 dd 0.707106781186547524400844 +PD_0_541 times 4 dd 0.541196100146196984399723 +PD_1_306 times 4 dd 1.306562964876376527856643 + + alignz 16 + +; -------------------------------------------------------------------------- + SECTION SEG_TEXT + BITS 64 +; +; Perform the forward DCT on one block of samples. +; +; GLOBAL(void) +; jsimd_fdct_float_sse (FAST_FLOAT *data) +; + +; r10 = FAST_FLOAT *data + +%define wk(i) rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM] +%define WK_NUM 2 + + align 16 + global EXTN(jsimd_fdct_float_sse) + +EXTN(jsimd_fdct_float_sse): + push rbp + mov rax,rsp ; rax = original rbp + sub rsp, byte 4 + and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits + mov [rsp],rax + mov rbp,rsp ; rbp = aligned rbp + lea rsp, [wk(0)] + collect_args + + ; ---- Pass 1: process rows. + + mov rdx, r10 ; (FAST_FLOAT *) + mov rcx, DCTSIZE/4 +.rowloop: + + movaps xmm0, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_FAST_FLOAT)] + movaps xmm1, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_FAST_FLOAT)] + movaps xmm2, XMMWORD [XMMBLOCK(2,1,rdx,SIZEOF_FAST_FLOAT)] + movaps xmm3, XMMWORD [XMMBLOCK(3,1,rdx,SIZEOF_FAST_FLOAT)] + + ; xmm0=(20 21 22 23), xmm2=(24 25 26 27) + ; xmm1=(30 31 32 33), xmm3=(34 35 36 37) + + movaps xmm4,xmm0 ; transpose coefficients(phase 1) + unpcklps xmm0,xmm1 ; xmm0=(20 30 21 31) + unpckhps xmm4,xmm1 ; xmm4=(22 32 23 33) + movaps xmm5,xmm2 ; transpose coefficients(phase 1) + unpcklps xmm2,xmm3 ; xmm2=(24 34 25 35) + unpckhps xmm5,xmm3 ; xmm5=(26 36 27 37) + + movaps xmm6, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FAST_FLOAT)] + movaps xmm7, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_FAST_FLOAT)] + movaps xmm1, XMMWORD [XMMBLOCK(0,1,rdx,SIZEOF_FAST_FLOAT)] + movaps xmm3, XMMWORD [XMMBLOCK(1,1,rdx,SIZEOF_FAST_FLOAT)] + + ; xmm6=(00 01 02 03), xmm1=(04 05 06 07) + ; xmm7=(10 11 12 13), xmm3=(14 15 16 17) + + movaps XMMWORD [wk(0)], xmm4 ; wk(0)=(22 32 23 33) + movaps XMMWORD [wk(1)], xmm2 ; wk(1)=(24 34 25 35) + + movaps xmm4,xmm6 ; transpose coefficients(phase 1) + unpcklps xmm6,xmm7 ; xmm6=(00 10 01 11) + unpckhps xmm4,xmm7 ; xmm4=(02 12 03 13) + movaps xmm2,xmm1 ; transpose coefficients(phase 1) + unpcklps xmm1,xmm3 ; xmm1=(04 14 05 15) + unpckhps xmm2,xmm3 ; xmm2=(06 16 07 17) + + movaps xmm7,xmm6 ; transpose coefficients(phase 2) + unpcklps2 xmm6,xmm0 ; xmm6=(00 10 20 30)=data0 + unpckhps2 xmm7,xmm0 ; xmm7=(01 11 21 31)=data1 + movaps xmm3,xmm2 ; transpose coefficients(phase 2) + unpcklps2 xmm2,xmm5 ; xmm2=(06 16 26 36)=data6 + unpckhps2 xmm3,xmm5 ; xmm3=(07 17 27 37)=data7 + + movaps xmm0,xmm7 + movaps xmm5,xmm6 + subps xmm7,xmm2 ; xmm7=data1-data6=tmp6 + subps xmm6,xmm3 ; xmm6=data0-data7=tmp7 + addps xmm0,xmm2 ; xmm0=data1+data6=tmp1 + addps xmm5,xmm3 ; xmm5=data0+data7=tmp0 + + movaps xmm2, XMMWORD [wk(0)] ; xmm2=(22 32 23 33) + movaps xmm3, XMMWORD [wk(1)] ; xmm3=(24 34 25 35) + movaps XMMWORD [wk(0)], xmm7 ; wk(0)=tmp6 + movaps XMMWORD [wk(1)], xmm6 ; wk(1)=tmp7 + + movaps xmm7,xmm4 ; transpose coefficients(phase 2) + unpcklps2 xmm4,xmm2 ; xmm4=(02 12 22 32)=data2 + unpckhps2 xmm7,xmm2 ; xmm7=(03 13 23 33)=data3 + movaps xmm6,xmm1 ; transpose coefficients(phase 2) + unpcklps2 xmm1,xmm3 ; xmm1=(04 14 24 34)=data4 + unpckhps2 xmm6,xmm3 ; xmm6=(05 15 25 35)=data5 + + movaps xmm2,xmm7 + movaps xmm3,xmm4 + addps xmm7,xmm1 ; xmm7=data3+data4=tmp3 + addps xmm4,xmm6 ; xmm4=data2+data5=tmp2 + subps xmm2,xmm1 ; xmm2=data3-data4=tmp4 + subps xmm3,xmm6 ; xmm3=data2-data5=tmp5 + + ; -- Even part + + movaps xmm1,xmm5 + movaps xmm6,xmm0 + subps xmm5,xmm7 ; xmm5=tmp13 + subps xmm0,xmm4 ; xmm0=tmp12 + addps xmm1,xmm7 ; xmm1=tmp10 + addps xmm6,xmm4 ; xmm6=tmp11 + + addps xmm0,xmm5 + mulps xmm0,[rel PD_0_707] ; xmm0=z1 + + movaps xmm7,xmm1 + movaps xmm4,xmm5 + subps xmm1,xmm6 ; xmm1=data4 + subps xmm5,xmm0 ; xmm5=data6 + addps xmm7,xmm6 ; xmm7=data0 + addps xmm4,xmm0 ; xmm4=data2 + + movaps XMMWORD [XMMBLOCK(0,1,rdx,SIZEOF_FAST_FLOAT)], xmm1 + movaps XMMWORD [XMMBLOCK(2,1,rdx,SIZEOF_FAST_FLOAT)], xmm5 + movaps XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FAST_FLOAT)], xmm7 + movaps XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_FAST_FLOAT)], xmm4 + + ; -- Odd part + + movaps xmm6, XMMWORD [wk(0)] ; xmm6=tmp6 + movaps xmm0, XMMWORD [wk(1)] ; xmm0=tmp7 + + addps xmm2,xmm3 ; xmm2=tmp10 + addps xmm3,xmm6 ; xmm3=tmp11 + addps xmm6,xmm0 ; xmm6=tmp12, xmm0=tmp7 + + mulps xmm3,[rel PD_0_707] ; xmm3=z3 + + movaps xmm1,xmm2 ; xmm1=tmp10 + subps xmm2,xmm6 + mulps xmm2,[rel PD_0_382] ; xmm2=z5 + mulps xmm1,[rel PD_0_541] ; xmm1=MULTIPLY(tmp10,FIX_0_541196) + mulps xmm6,[rel PD_1_306] ; xmm6=MULTIPLY(tmp12,FIX_1_306562) + addps xmm1,xmm2 ; xmm1=z2 + addps xmm6,xmm2 ; xmm6=z4 + + movaps xmm5,xmm0 + subps xmm0,xmm3 ; xmm0=z13 + addps xmm5,xmm3 ; xmm5=z11 + + movaps xmm7,xmm0 + movaps xmm4,xmm5 + subps xmm0,xmm1 ; xmm0=data3 + subps xmm5,xmm6 ; xmm5=data7 + addps xmm7,xmm1 ; xmm7=data5 + addps xmm4,xmm6 ; xmm4=data1 + + movaps XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_FAST_FLOAT)], xmm0 + movaps XMMWORD [XMMBLOCK(3,1,rdx,SIZEOF_FAST_FLOAT)], xmm5 + movaps XMMWORD [XMMBLOCK(1,1,rdx,SIZEOF_FAST_FLOAT)], xmm7 + movaps XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_FAST_FLOAT)], xmm4 + + add rdx, 4*DCTSIZE*SIZEOF_FAST_FLOAT + dec rcx + jnz near .rowloop + + ; ---- Pass 2: process columns. + + mov rdx, r10 ; (FAST_FLOAT *) + mov rcx, DCTSIZE/4 +.columnloop: + + movaps xmm0, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_FAST_FLOAT)] + movaps xmm1, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_FAST_FLOAT)] + movaps xmm2, XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_FAST_FLOAT)] + movaps xmm3, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_FAST_FLOAT)] + + ; xmm0=(02 12 22 32), xmm2=(42 52 62 72) + ; xmm1=(03 13 23 33), xmm3=(43 53 63 73) + + movaps xmm4,xmm0 ; transpose coefficients(phase 1) + unpcklps xmm0,xmm1 ; xmm0=(02 03 12 13) + unpckhps xmm4,xmm1 ; xmm4=(22 23 32 33) + movaps xmm5,xmm2 ; transpose coefficients(phase 1) + unpcklps xmm2,xmm3 ; xmm2=(42 43 52 53) + unpckhps xmm5,xmm3 ; xmm5=(62 63 72 73) + + movaps xmm6, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FAST_FLOAT)] + movaps xmm7, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_FAST_FLOAT)] + movaps xmm1, XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_FAST_FLOAT)] + movaps xmm3, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_FAST_FLOAT)] + + ; xmm6=(00 10 20 30), xmm1=(40 50 60 70) + ; xmm7=(01 11 21 31), xmm3=(41 51 61 71) + + movaps XMMWORD [wk(0)], xmm4 ; wk(0)=(22 23 32 33) + movaps XMMWORD [wk(1)], xmm2 ; wk(1)=(42 43 52 53) + + movaps xmm4,xmm6 ; transpose coefficients(phase 1) + unpcklps xmm6,xmm7 ; xmm6=(00 01 10 11) + unpckhps xmm4,xmm7 ; xmm4=(20 21 30 31) + movaps xmm2,xmm1 ; transpose coefficients(phase 1) + unpcklps xmm1,xmm3 ; xmm1=(40 41 50 51) + unpckhps xmm2,xmm3 ; xmm2=(60 61 70 71) + + movaps xmm7,xmm6 ; transpose coefficients(phase 2) + unpcklps2 xmm6,xmm0 ; xmm6=(00 01 02 03)=data0 + unpckhps2 xmm7,xmm0 ; xmm7=(10 11 12 13)=data1 + movaps xmm3,xmm2 ; transpose coefficients(phase 2) + unpcklps2 xmm2,xmm5 ; xmm2=(60 61 62 63)=data6 + unpckhps2 xmm3,xmm5 ; xmm3=(70 71 72 73)=data7 + + movaps xmm0,xmm7 + movaps xmm5,xmm6 + subps xmm7,xmm2 ; xmm7=data1-data6=tmp6 + subps xmm6,xmm3 ; xmm6=data0-data7=tmp7 + addps xmm0,xmm2 ; xmm0=data1+data6=tmp1 + addps xmm5,xmm3 ; xmm5=data0+data7=tmp0 + + movaps xmm2, XMMWORD [wk(0)] ; xmm2=(22 23 32 33) + movaps xmm3, XMMWORD [wk(1)] ; xmm3=(42 43 52 53) + movaps XMMWORD [wk(0)], xmm7 ; wk(0)=tmp6 + movaps XMMWORD [wk(1)], xmm6 ; wk(1)=tmp7 + + movaps xmm7,xmm4 ; transpose coefficients(phase 2) + unpcklps2 xmm4,xmm2 ; xmm4=(20 21 22 23)=data2 + unpckhps2 xmm7,xmm2 ; xmm7=(30 31 32 33)=data3 + movaps xmm6,xmm1 ; transpose coefficients(phase 2) + unpcklps2 xmm1,xmm3 ; xmm1=(40 41 42 43)=data4 + unpckhps2 xmm6,xmm3 ; xmm6=(50 51 52 53)=data5 + + movaps xmm2,xmm7 + movaps xmm3,xmm4 + addps xmm7,xmm1 ; xmm7=data3+data4=tmp3 + addps xmm4,xmm6 ; xmm4=data2+data5=tmp2 + subps xmm2,xmm1 ; xmm2=data3-data4=tmp4 + subps xmm3,xmm6 ; xmm3=data2-data5=tmp5 + + ; -- Even part + + movaps xmm1,xmm5 + movaps xmm6,xmm0 + subps xmm5,xmm7 ; xmm5=tmp13 + subps xmm0,xmm4 ; xmm0=tmp12 + addps xmm1,xmm7 ; xmm1=tmp10 + addps xmm6,xmm4 ; xmm6=tmp11 + + addps xmm0,xmm5 + mulps xmm0,[rel PD_0_707] ; xmm0=z1 + + movaps xmm7,xmm1 + movaps xmm4,xmm5 + subps xmm1,xmm6 ; xmm1=data4 + subps xmm5,xmm0 ; xmm5=data6 + addps xmm7,xmm6 ; xmm7=data0 + addps xmm4,xmm0 ; xmm4=data2 + + movaps XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_FAST_FLOAT)], xmm1 + movaps XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_FAST_FLOAT)], xmm5 + movaps XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FAST_FLOAT)], xmm7 + movaps XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_FAST_FLOAT)], xmm4 + + ; -- Odd part + + movaps xmm6, XMMWORD [wk(0)] ; xmm6=tmp6 + movaps xmm0, XMMWORD [wk(1)] ; xmm0=tmp7 + + addps xmm2,xmm3 ; xmm2=tmp10 + addps xmm3,xmm6 ; xmm3=tmp11 + addps xmm6,xmm0 ; xmm6=tmp12, xmm0=tmp7 + + mulps xmm3,[rel PD_0_707] ; xmm3=z3 + + movaps xmm1,xmm2 ; xmm1=tmp10 + subps xmm2,xmm6 + mulps xmm2,[rel PD_0_382] ; xmm2=z5 + mulps xmm1,[rel PD_0_541] ; xmm1=MULTIPLY(tmp10,FIX_0_541196) + mulps xmm6,[rel PD_1_306] ; xmm6=MULTIPLY(tmp12,FIX_1_306562) + addps xmm1,xmm2 ; xmm1=z2 + addps xmm6,xmm2 ; xmm6=z4 + + movaps xmm5,xmm0 + subps xmm0,xmm3 ; xmm0=z13 + addps xmm5,xmm3 ; xmm5=z11 + + movaps xmm7,xmm0 + movaps xmm4,xmm5 + subps xmm0,xmm1 ; xmm0=data3 + subps xmm5,xmm6 ; xmm5=data7 + addps xmm7,xmm1 ; xmm7=data5 + addps xmm4,xmm6 ; xmm4=data1 + + movaps XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_FAST_FLOAT)], xmm0 + movaps XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_FAST_FLOAT)], xmm5 + movaps XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_FAST_FLOAT)], xmm7 + movaps XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_FAST_FLOAT)], xmm4 + + add rdx, byte 4*SIZEOF_FAST_FLOAT + dec rcx + jnz near .columnloop + + uncollect_args + mov rsp,rbp ; rsp <- aligned rbp + pop rsp ; rsp <- original rbp + pop rbp + ret + +; For some reason, the OS X linker does not honor the request to align the +; segment unless we do this. + align 16 diff --git a/media/libjpeg/simd/jfdctflt-sse.asm b/media/libjpeg/simd/jfdctflt-sse.asm new file mode 100644 index 000000000..e7ede26c0 --- /dev/null +++ b/media/libjpeg/simd/jfdctflt-sse.asm @@ -0,0 +1,369 @@ +; +; jfdctflt.asm - floating-point FDCT (SSE) +; +; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB +; +; Based on the x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler). +; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 +; +; This file contains a floating-point implementation of the forward DCT +; (Discrete Cosine Transform). The following code is based directly on +; the IJG's original jfdctflt.c; see the jfdctflt.c for more details. +; +; [TAB8] + +%include "jsimdext.inc" +%include "jdct.inc" + +; -------------------------------------------------------------------------- + +%macro unpcklps2 2 ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(0 1 4 5) + shufps %1,%2,0x44 +%endmacro + +%macro unpckhps2 2 ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(2 3 6 7) + shufps %1,%2,0xEE +%endmacro + +; -------------------------------------------------------------------------- + SECTION SEG_CONST + + alignz 16 + global EXTN(jconst_fdct_float_sse) + +EXTN(jconst_fdct_float_sse): + +PD_0_382 times 4 dd 0.382683432365089771728460 +PD_0_707 times 4 dd 0.707106781186547524400844 +PD_0_541 times 4 dd 0.541196100146196984399723 +PD_1_306 times 4 dd 1.306562964876376527856643 + + alignz 16 + +; -------------------------------------------------------------------------- + SECTION SEG_TEXT + BITS 32 +; +; Perform the forward DCT on one block of samples. +; +; GLOBAL(void) +; jsimd_fdct_float_sse (FAST_FLOAT *data) +; + +%define data(b) (b)+8 ; FAST_FLOAT *data + +%define original_ebp ebp+0 +%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM] +%define WK_NUM 2 + + align 16 + global EXTN(jsimd_fdct_float_sse) + +EXTN(jsimd_fdct_float_sse): + push ebp + mov eax,esp ; eax = original ebp + sub esp, byte 4 + and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits + mov [esp],eax + mov ebp,esp ; ebp = aligned ebp + lea esp, [wk(0)] + pushpic ebx +; push ecx ; need not be preserved +; push edx ; need not be preserved +; push esi ; unused +; push edi ; unused + + get_GOT ebx ; get GOT address + + ; ---- Pass 1: process rows. + + mov edx, POINTER [data(eax)] ; (FAST_FLOAT *) + mov ecx, DCTSIZE/4 + alignx 16,7 +.rowloop: + + movaps xmm0, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_FAST_FLOAT)] + movaps xmm1, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_FAST_FLOAT)] + movaps xmm2, XMMWORD [XMMBLOCK(2,1,edx,SIZEOF_FAST_FLOAT)] + movaps xmm3, XMMWORD [XMMBLOCK(3,1,edx,SIZEOF_FAST_FLOAT)] + + ; xmm0=(20 21 22 23), xmm2=(24 25 26 27) + ; xmm1=(30 31 32 33), xmm3=(34 35 36 37) + + movaps xmm4,xmm0 ; transpose coefficients(phase 1) + unpcklps xmm0,xmm1 ; xmm0=(20 30 21 31) + unpckhps xmm4,xmm1 ; xmm4=(22 32 23 33) + movaps xmm5,xmm2 ; transpose coefficients(phase 1) + unpcklps xmm2,xmm3 ; xmm2=(24 34 25 35) + unpckhps xmm5,xmm3 ; xmm5=(26 36 27 37) + + movaps xmm6, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)] + movaps xmm7, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)] + movaps xmm1, XMMWORD [XMMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)] + movaps xmm3, XMMWORD [XMMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)] + + ; xmm6=(00 01 02 03), xmm1=(04 05 06 07) + ; xmm7=(10 11 12 13), xmm3=(14 15 16 17) + + movaps XMMWORD [wk(0)], xmm4 ; wk(0)=(22 32 23 33) + movaps XMMWORD [wk(1)], xmm2 ; wk(1)=(24 34 25 35) + + movaps xmm4,xmm6 ; transpose coefficients(phase 1) + unpcklps xmm6,xmm7 ; xmm6=(00 10 01 11) + unpckhps xmm4,xmm7 ; xmm4=(02 12 03 13) + movaps xmm2,xmm1 ; transpose coefficients(phase 1) + unpcklps xmm1,xmm3 ; xmm1=(04 14 05 15) + unpckhps xmm2,xmm3 ; xmm2=(06 16 07 17) + + movaps xmm7,xmm6 ; transpose coefficients(phase 2) + unpcklps2 xmm6,xmm0 ; xmm6=(00 10 20 30)=data0 + unpckhps2 xmm7,xmm0 ; xmm7=(01 11 21 31)=data1 + movaps xmm3,xmm2 ; transpose coefficients(phase 2) + unpcklps2 xmm2,xmm5 ; xmm2=(06 16 26 36)=data6 + unpckhps2 xmm3,xmm5 ; xmm3=(07 17 27 37)=data7 + + movaps xmm0,xmm7 + movaps xmm5,xmm6 + subps xmm7,xmm2 ; xmm7=data1-data6=tmp6 + subps xmm6,xmm3 ; xmm6=data0-data7=tmp7 + addps xmm0,xmm2 ; xmm0=data1+data6=tmp1 + addps xmm5,xmm3 ; xmm5=data0+data7=tmp0 + + movaps xmm2, XMMWORD [wk(0)] ; xmm2=(22 32 23 33) + movaps xmm3, XMMWORD [wk(1)] ; xmm3=(24 34 25 35) + movaps XMMWORD [wk(0)], xmm7 ; wk(0)=tmp6 + movaps XMMWORD [wk(1)], xmm6 ; wk(1)=tmp7 + + movaps xmm7,xmm4 ; transpose coefficients(phase 2) + unpcklps2 xmm4,xmm2 ; xmm4=(02 12 22 32)=data2 + unpckhps2 xmm7,xmm2 ; xmm7=(03 13 23 33)=data3 + movaps xmm6,xmm1 ; transpose coefficients(phase 2) + unpcklps2 xmm1,xmm3 ; xmm1=(04 14 24 34)=data4 + unpckhps2 xmm6,xmm3 ; xmm6=(05 15 25 35)=data5 + + movaps xmm2,xmm7 + movaps xmm3,xmm4 + addps xmm7,xmm1 ; xmm7=data3+data4=tmp3 + addps xmm4,xmm6 ; xmm4=data2+data5=tmp2 + subps xmm2,xmm1 ; xmm2=data3-data4=tmp4 + subps xmm3,xmm6 ; xmm3=data2-data5=tmp5 + + ; -- Even part + + movaps xmm1,xmm5 + movaps xmm6,xmm0 + subps xmm5,xmm7 ; xmm5=tmp13 + subps xmm0,xmm4 ; xmm0=tmp12 + addps xmm1,xmm7 ; xmm1=tmp10 + addps xmm6,xmm4 ; xmm6=tmp11 + + addps xmm0,xmm5 + mulps xmm0,[GOTOFF(ebx,PD_0_707)] ; xmm0=z1 + + movaps xmm7,xmm1 + movaps xmm4,xmm5 + subps xmm1,xmm6 ; xmm1=data4 + subps xmm5,xmm0 ; xmm5=data6 + addps xmm7,xmm6 ; xmm7=data0 + addps xmm4,xmm0 ; xmm4=data2 + + movaps XMMWORD [XMMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)], xmm1 + movaps XMMWORD [XMMBLOCK(2,1,edx,SIZEOF_FAST_FLOAT)], xmm5 + movaps XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)], xmm7 + movaps XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_FAST_FLOAT)], xmm4 + + ; -- Odd part + + movaps xmm6, XMMWORD [wk(0)] ; xmm6=tmp6 + movaps xmm0, XMMWORD [wk(1)] ; xmm0=tmp7 + + addps xmm2,xmm3 ; xmm2=tmp10 + addps xmm3,xmm6 ; xmm3=tmp11 + addps xmm6,xmm0 ; xmm6=tmp12, xmm0=tmp7 + + mulps xmm3,[GOTOFF(ebx,PD_0_707)] ; xmm3=z3 + + movaps xmm1,xmm2 ; xmm1=tmp10 + subps xmm2,xmm6 + mulps xmm2,[GOTOFF(ebx,PD_0_382)] ; xmm2=z5 + mulps xmm1,[GOTOFF(ebx,PD_0_541)] ; xmm1=MULTIPLY(tmp10,FIX_0_541196) + mulps xmm6,[GOTOFF(ebx,PD_1_306)] ; xmm6=MULTIPLY(tmp12,FIX_1_306562) + addps xmm1,xmm2 ; xmm1=z2 + addps xmm6,xmm2 ; xmm6=z4 + + movaps xmm5,xmm0 + subps xmm0,xmm3 ; xmm0=z13 + addps xmm5,xmm3 ; xmm5=z11 + + movaps xmm7,xmm0 + movaps xmm4,xmm5 + subps xmm0,xmm1 ; xmm0=data3 + subps xmm5,xmm6 ; xmm5=data7 + addps xmm7,xmm1 ; xmm7=data5 + addps xmm4,xmm6 ; xmm4=data1 + + movaps XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_FAST_FLOAT)], xmm0 + movaps XMMWORD [XMMBLOCK(3,1,edx,SIZEOF_FAST_FLOAT)], xmm5 + movaps XMMWORD [XMMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)], xmm7 + movaps XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)], xmm4 + + add edx, 4*DCTSIZE*SIZEOF_FAST_FLOAT + dec ecx + jnz near .rowloop + + ; ---- Pass 2: process columns. + + mov edx, POINTER [data(eax)] ; (FAST_FLOAT *) + mov ecx, DCTSIZE/4 + alignx 16,7 +.columnloop: + + movaps xmm0, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_FAST_FLOAT)] + movaps xmm1, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_FAST_FLOAT)] + movaps xmm2, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_FAST_FLOAT)] + movaps xmm3, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_FAST_FLOAT)] + + ; xmm0=(02 12 22 32), xmm2=(42 52 62 72) + ; xmm1=(03 13 23 33), xmm3=(43 53 63 73) + + movaps xmm4,xmm0 ; transpose coefficients(phase 1) + unpcklps xmm0,xmm1 ; xmm0=(02 03 12 13) + unpckhps xmm4,xmm1 ; xmm4=(22 23 32 33) + movaps xmm5,xmm2 ; transpose coefficients(phase 1) + unpcklps xmm2,xmm3 ; xmm2=(42 43 52 53) + unpckhps xmm5,xmm3 ; xmm5=(62 63 72 73) + + movaps xmm6, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)] + movaps xmm7, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)] + movaps xmm1, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_FAST_FLOAT)] + movaps xmm3, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_FAST_FLOAT)] + + ; xmm6=(00 10 20 30), xmm1=(40 50 60 70) + ; xmm7=(01 11 21 31), xmm3=(41 51 61 71) + + movaps XMMWORD [wk(0)], xmm4 ; wk(0)=(22 23 32 33) + movaps XMMWORD [wk(1)], xmm2 ; wk(1)=(42 43 52 53) + + movaps xmm4,xmm6 ; transpose coefficients(phase 1) + unpcklps xmm6,xmm7 ; xmm6=(00 01 10 11) + unpckhps xmm4,xmm7 ; xmm4=(20 21 30 31) + movaps xmm2,xmm1 ; transpose coefficients(phase 1) + unpcklps xmm1,xmm3 ; xmm1=(40 41 50 51) + unpckhps xmm2,xmm3 ; xmm2=(60 61 70 71) + + movaps xmm7,xmm6 ; transpose coefficients(phase 2) + unpcklps2 xmm6,xmm0 ; xmm6=(00 01 02 03)=data0 + unpckhps2 xmm7,xmm0 ; xmm7=(10 11 12 13)=data1 + movaps xmm3,xmm2 ; transpose coefficients(phase 2) + unpcklps2 xmm2,xmm5 ; xmm2=(60 61 62 63)=data6 + unpckhps2 xmm3,xmm5 ; xmm3=(70 71 72 73)=data7 + + movaps xmm0,xmm7 + movaps xmm5,xmm6 + subps xmm7,xmm2 ; xmm7=data1-data6=tmp6 + subps xmm6,xmm3 ; xmm6=data0-data7=tmp7 + addps xmm0,xmm2 ; xmm0=data1+data6=tmp1 + addps xmm5,xmm3 ; xmm5=data0+data7=tmp0 + + movaps xmm2, XMMWORD [wk(0)] ; xmm2=(22 23 32 33) + movaps xmm3, XMMWORD [wk(1)] ; xmm3=(42 43 52 53) + movaps XMMWORD [wk(0)], xmm7 ; wk(0)=tmp6 + movaps XMMWORD [wk(1)], xmm6 ; wk(1)=tmp7 + + movaps xmm7,xmm4 ; transpose coefficients(phase 2) + unpcklps2 xmm4,xmm2 ; xmm4=(20 21 22 23)=data2 + unpckhps2 xmm7,xmm2 ; xmm7=(30 31 32 33)=data3 + movaps xmm6,xmm1 ; transpose coefficients(phase 2) + unpcklps2 xmm1,xmm3 ; xmm1=(40 41 42 43)=data4 + unpckhps2 xmm6,xmm3 ; xmm6=(50 51 52 53)=data5 + + movaps xmm2,xmm7 + movaps xmm3,xmm4 + addps xmm7,xmm1 ; xmm7=data3+data4=tmp3 + addps xmm4,xmm6 ; xmm4=data2+data5=tmp2 + subps xmm2,xmm1 ; xmm2=data3-data4=tmp4 + subps xmm3,xmm6 ; xmm3=data2-data5=tmp5 + + ; -- Even part + + movaps xmm1,xmm5 + movaps xmm6,xmm0 + subps xmm5,xmm7 ; xmm5=tmp13 + subps xmm0,xmm4 ; xmm0=tmp12 + addps xmm1,xmm7 ; xmm1=tmp10 + addps xmm6,xmm4 ; xmm6=tmp11 + + addps xmm0,xmm5 + mulps xmm0,[GOTOFF(ebx,PD_0_707)] ; xmm0=z1 + + movaps xmm7,xmm1 + movaps xmm4,xmm5 + subps xmm1,xmm6 ; xmm1=data4 + subps xmm5,xmm0 ; xmm5=data6 + addps xmm7,xmm6 ; xmm7=data0 + addps xmm4,xmm0 ; xmm4=data2 + + movaps XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_FAST_FLOAT)], xmm1 + movaps XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_FAST_FLOAT)], xmm5 + movaps XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)], xmm7 + movaps XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_FAST_FLOAT)], xmm4 + + ; -- Odd part + + movaps xmm6, XMMWORD [wk(0)] ; xmm6=tmp6 + movaps xmm0, XMMWORD [wk(1)] ; xmm0=tmp7 + + addps xmm2,xmm3 ; xmm2=tmp10 + addps xmm3,xmm6 ; xmm3=tmp11 + addps xmm6,xmm0 ; xmm6=tmp12, xmm0=tmp7 + + mulps xmm3,[GOTOFF(ebx,PD_0_707)] ; xmm3=z3 + + movaps xmm1,xmm2 ; xmm1=tmp10 + subps xmm2,xmm6 + mulps xmm2,[GOTOFF(ebx,PD_0_382)] ; xmm2=z5 + mulps xmm1,[GOTOFF(ebx,PD_0_541)] ; xmm1=MULTIPLY(tmp10,FIX_0_541196) + mulps xmm6,[GOTOFF(ebx,PD_1_306)] ; xmm6=MULTIPLY(tmp12,FIX_1_306562) + addps xmm1,xmm2 ; xmm1=z2 + addps xmm6,xmm2 ; xmm6=z4 + + movaps xmm5,xmm0 + subps xmm0,xmm3 ; xmm0=z13 + addps xmm5,xmm3 ; xmm5=z11 + + movaps xmm7,xmm0 + movaps xmm4,xmm5 + subps xmm0,xmm1 ; xmm0=data3 + subps xmm5,xmm6 ; xmm5=data7 + addps xmm7,xmm1 ; xmm7=data5 + addps xmm4,xmm6 ; xmm4=data1 + + movaps XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_FAST_FLOAT)], xmm0 + movaps XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_FAST_FLOAT)], xmm5 + movaps XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_FAST_FLOAT)], xmm7 + movaps XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)], xmm4 + + add edx, byte 4*SIZEOF_FAST_FLOAT + dec ecx + jnz near .columnloop + +; pop edi ; unused +; pop esi ; unused +; pop edx ; need not be preserved +; pop ecx ; need not be preserved + poppic ebx + mov esp,ebp ; esp <- aligned ebp + pop esp ; esp <- original ebp + pop ebp + ret + +; For some reason, the OS X linker does not honor the request to align the +; segment unless we do this. + align 16 diff --git a/media/libjpeg/simd/jfdctfst-altivec.c b/media/libjpeg/simd/jfdctfst-altivec.c new file mode 100644 index 000000000..04157f77e --- /dev/null +++ b/media/libjpeg/simd/jfdctfst-altivec.c @@ -0,0 +1,156 @@ +/* + * AltiVec optimizations for libjpeg-turbo + * + * Copyright (C) 2014, D. R. Commander. All Rights Reserved. + * + * This software is provided 'as-is', without any express or implied + * warranty. In no event will the authors be held liable for any damages + * arising from the use of this software. + * + * Permission is granted to anyone to use this software for any purpose, + * including commercial applications, and to alter it and redistribute it + * freely, subject to the following restrictions: + * + * 1. The origin of this software must not be misrepresented; you must not + * claim that you wrote the original software. If you use this software + * in a product, an acknowledgment in the product documentation would be + * appreciated but is not required. + * 2. Altered source versions must be plainly marked as such, and must not be + * misrepresented as being the original software. + * 3. This notice may not be removed or altered from any source distribution. + */ + +/* FAST INTEGER FORWARD DCT + * + * This is similar to the SSE2 implementation, except that we left-shift the + * constants by 1 less bit (the -1 in CONST_SHIFT.) This is because + * vec_madds(arg1, arg2, arg3) generates the 16-bit saturated sum of: + * the elements in arg3 + the most significant 17 bits of + * (the elements in arg1 * the elements in arg2). + */ + +#include "jsimd_altivec.h" + + +#define F_0_382 98 /* FIX(0.382683433) */ +#define F_0_541 139 /* FIX(0.541196100) */ +#define F_0_707 181 /* FIX(0.707106781) */ +#define F_1_306 334 /* FIX(1.306562965) */ + +#define CONST_BITS 8 +#define PRE_MULTIPLY_SCALE_BITS 2 +#define CONST_SHIFT (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS - 1) + + +#define DO_FDCT() \ +{ \ + /* Even part */ \ + \ + tmp10 = vec_add(tmp0, tmp3); \ + tmp13 = vec_sub(tmp0, tmp3); \ + tmp11 = vec_add(tmp1, tmp2); \ + tmp12 = vec_sub(tmp1, tmp2); \ + \ + out0 = vec_add(tmp10, tmp11); \ + out4 = vec_sub(tmp10, tmp11); \ + \ + z1 = vec_add(tmp12, tmp13); \ + z1 = vec_sl(z1, pre_multiply_scale_bits); \ + z1 = vec_madds(z1, pw_0707, pw_zero); \ + \ + out2 = vec_add(tmp13, z1); \ + out6 = vec_sub(tmp13, z1); \ + \ + /* Odd part */ \ + \ + tmp10 = vec_add(tmp4, tmp5); \ + tmp11 = vec_add(tmp5, tmp6); \ + tmp12 = vec_add(tmp6, tmp7); \ + \ + tmp10 = vec_sl(tmp10, pre_multiply_scale_bits); \ + tmp12 = vec_sl(tmp12, pre_multiply_scale_bits); \ + z5 = vec_sub(tmp10, tmp12); \ + z5 = vec_madds(z5, pw_0382, pw_zero); \ + \ + z2 = vec_madds(tmp10, pw_0541, z5); \ + z4 = vec_madds(tmp12, pw_1306, z5); \ + \ + tmp11 = vec_sl(tmp11, pre_multiply_scale_bits); \ + z3 = vec_madds(tmp11, pw_0707, pw_zero); \ + \ + z11 = vec_add(tmp7, z3); \ + z13 = vec_sub(tmp7, z3); \ + \ + out5 = vec_add(z13, z2); \ + out3 = vec_sub(z13, z2); \ + out1 = vec_add(z11, z4); \ + out7 = vec_sub(z11, z4); \ +} + + +void +jsimd_fdct_ifast_altivec (DCTELEM *data) +{ + __vector short row0, row1, row2, row3, row4, row5, row6, row7, + col0, col1, col2, col3, col4, col5, col6, col7, + tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp10, tmp11, tmp12, tmp13, + z1, z2, z3, z4, z5, z11, z13, + out0, out1, out2, out3, out4, out5, out6, out7; + + /* Constants */ + __vector short pw_zero = { __8X(0) }, + pw_0382 = { __8X(F_0_382 << CONST_SHIFT) }, + pw_0541 = { __8X(F_0_541 << CONST_SHIFT) }, + pw_0707 = { __8X(F_0_707 << CONST_SHIFT) }, + pw_1306 = { __8X(F_1_306 << CONST_SHIFT) }; + __vector unsigned short + pre_multiply_scale_bits = { __8X(PRE_MULTIPLY_SCALE_BITS) }; + + /* Pass 1: process rows */ + + row0 = vec_ld(0, data); + row1 = vec_ld(16, data); + row2 = vec_ld(32, data); + row3 = vec_ld(48, data); + row4 = vec_ld(64, data); + row5 = vec_ld(80, data); + row6 = vec_ld(96, data); + row7 = vec_ld(112, data); + + TRANSPOSE(row, col); + + tmp0 = vec_add(col0, col7); + tmp7 = vec_sub(col0, col7); + tmp1 = vec_add(col1, col6); + tmp6 = vec_sub(col1, col6); + tmp2 = vec_add(col2, col5); + tmp5 = vec_sub(col2, col5); + tmp3 = vec_add(col3, col4); + tmp4 = vec_sub(col3, col4); + + DO_FDCT(); + + /* Pass 2: process columns */ + + TRANSPOSE(out, row); + + tmp0 = vec_add(row0, row7); + tmp7 = vec_sub(row0, row7); + tmp1 = vec_add(row1, row6); + tmp6 = vec_sub(row1, row6); + tmp2 = vec_add(row2, row5); + tmp5 = vec_sub(row2, row5); + tmp3 = vec_add(row3, row4); + tmp4 = vec_sub(row3, row4); + + DO_FDCT(); + + vec_st(out0, 0, data); + vec_st(out1, 16, data); + vec_st(out2, 32, data); + vec_st(out3, 48, data); + vec_st(out4, 64, data); + vec_st(out5, 80, data); + vec_st(out6, 96, data); + vec_st(out7, 112, data); +} diff --git a/media/libjpeg/simd/jfdctfst-mmx.asm b/media/libjpeg/simd/jfdctfst-mmx.asm new file mode 100644 index 000000000..eb2eb9c50 --- /dev/null +++ b/media/libjpeg/simd/jfdctfst-mmx.asm @@ -0,0 +1,396 @@ +; +; jfdctfst.asm - fast integer FDCT (MMX) +; +; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB +; +; Based on the x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler). +; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 +; +; This file contains a fast, not so accurate integer implementation of +; the forward DCT (Discrete Cosine Transform). The following code is +; based directly on the IJG's original jfdctfst.c; see the jfdctfst.c +; for more details. +; +; [TAB8] + +%include "jsimdext.inc" +%include "jdct.inc" + +; -------------------------------------------------------------------------- + +%define CONST_BITS 8 ; 14 is also OK. + +%if CONST_BITS == 8 +F_0_382 equ 98 ; FIX(0.382683433) +F_0_541 equ 139 ; FIX(0.541196100) +F_0_707 equ 181 ; FIX(0.707106781) +F_1_306 equ 334 ; FIX(1.306562965) +%else +; NASM cannot do compile-time arithmetic on floating-point constants. +%define DESCALE(x,n) (((x)+(1<<((n)-1)))>>(n)) +F_0_382 equ DESCALE( 410903207,30-CONST_BITS) ; FIX(0.382683433) +F_0_541 equ DESCALE( 581104887,30-CONST_BITS) ; FIX(0.541196100) +F_0_707 equ DESCALE( 759250124,30-CONST_BITS) ; FIX(0.707106781) +F_1_306 equ DESCALE(1402911301,30-CONST_BITS) ; FIX(1.306562965) +%endif + +; -------------------------------------------------------------------------- + SECTION SEG_CONST + +; PRE_MULTIPLY_SCALE_BITS <= 2 (to avoid overflow) +; CONST_BITS + CONST_SHIFT + PRE_MULTIPLY_SCALE_BITS == 16 (for pmulhw) + +%define PRE_MULTIPLY_SCALE_BITS 2 +%define CONST_SHIFT (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS) + + alignz 16 + global EXTN(jconst_fdct_ifast_mmx) + +EXTN(jconst_fdct_ifast_mmx): + +PW_F0707 times 4 dw F_0_707 << CONST_SHIFT +PW_F0382 times 4 dw F_0_382 << CONST_SHIFT +PW_F0541 times 4 dw F_0_541 << CONST_SHIFT +PW_F1306 times 4 dw F_1_306 << CONST_SHIFT + + alignz 16 + +; -------------------------------------------------------------------------- + SECTION SEG_TEXT + BITS 32 +; +; Perform the forward DCT on one block of samples. +; +; GLOBAL(void) +; jsimd_fdct_ifast_mmx (DCTELEM *data) +; + +%define data(b) (b)+8 ; DCTELEM *data + +%define original_ebp ebp+0 +%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_MMWORD ; mmword wk[WK_NUM] +%define WK_NUM 2 + + align 16 + global EXTN(jsimd_fdct_ifast_mmx) + +EXTN(jsimd_fdct_ifast_mmx): + push ebp + mov eax,esp ; eax = original ebp + sub esp, byte 4 + and esp, byte (-SIZEOF_MMWORD) ; align to 64 bits + mov [esp],eax + mov ebp,esp ; ebp = aligned ebp + lea esp, [wk(0)] + pushpic ebx +; push ecx ; need not be preserved +; push edx ; need not be preserved +; push esi ; unused +; push edi ; unused + + get_GOT ebx ; get GOT address + + ; ---- Pass 1: process rows. + + mov edx, POINTER [data(eax)] ; (DCTELEM *) + mov ecx, DCTSIZE/4 + alignx 16,7 +.rowloop: + + movq mm0, MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)] + movq mm1, MMWORD [MMBLOCK(3,0,edx,SIZEOF_DCTELEM)] + movq mm2, MMWORD [MMBLOCK(2,1,edx,SIZEOF_DCTELEM)] + movq mm3, MMWORD [MMBLOCK(3,1,edx,SIZEOF_DCTELEM)] + + ; mm0=(20 21 22 23), mm2=(24 25 26 27) + ; mm1=(30 31 32 33), mm3=(34 35 36 37) + + movq mm4,mm0 ; transpose coefficients(phase 1) + punpcklwd mm0,mm1 ; mm0=(20 30 21 31) + punpckhwd mm4,mm1 ; mm4=(22 32 23 33) + movq mm5,mm2 ; transpose coefficients(phase 1) + punpcklwd mm2,mm3 ; mm2=(24 34 25 35) + punpckhwd mm5,mm3 ; mm5=(26 36 27 37) + + movq mm6, MMWORD [MMBLOCK(0,0,edx,SIZEOF_DCTELEM)] + movq mm7, MMWORD [MMBLOCK(1,0,edx,SIZEOF_DCTELEM)] + movq mm1, MMWORD [MMBLOCK(0,1,edx,SIZEOF_DCTELEM)] + movq mm3, MMWORD [MMBLOCK(1,1,edx,SIZEOF_DCTELEM)] + + ; mm6=(00 01 02 03), mm1=(04 05 06 07) + ; mm7=(10 11 12 13), mm3=(14 15 16 17) + + movq MMWORD [wk(0)], mm4 ; wk(0)=(22 32 23 33) + movq MMWORD [wk(1)], mm2 ; wk(1)=(24 34 25 35) + + movq mm4,mm6 ; transpose coefficients(phase 1) + punpcklwd mm6,mm7 ; mm6=(00 10 01 11) + punpckhwd mm4,mm7 ; mm4=(02 12 03 13) + movq mm2,mm1 ; transpose coefficients(phase 1) + punpcklwd mm1,mm3 ; mm1=(04 14 05 15) + punpckhwd mm2,mm3 ; mm2=(06 16 07 17) + + movq mm7,mm6 ; transpose coefficients(phase 2) + punpckldq mm6,mm0 ; mm6=(00 10 20 30)=data0 + punpckhdq mm7,mm0 ; mm7=(01 11 21 31)=data1 + movq mm3,mm2 ; transpose coefficients(phase 2) + punpckldq mm2,mm5 ; mm2=(06 16 26 36)=data6 + punpckhdq mm3,mm5 ; mm3=(07 17 27 37)=data7 + + movq mm0,mm7 + movq mm5,mm6 + psubw mm7,mm2 ; mm7=data1-data6=tmp6 + psubw mm6,mm3 ; mm6=data0-data7=tmp7 + paddw mm0,mm2 ; mm0=data1+data6=tmp1 + paddw mm5,mm3 ; mm5=data0+data7=tmp0 + + movq mm2, MMWORD [wk(0)] ; mm2=(22 32 23 33) + movq mm3, MMWORD [wk(1)] ; mm3=(24 34 25 35) + movq MMWORD [wk(0)], mm7 ; wk(0)=tmp6 + movq MMWORD [wk(1)], mm6 ; wk(1)=tmp7 + + movq mm7,mm4 ; transpose coefficients(phase 2) + punpckldq mm4,mm2 ; mm4=(02 12 22 32)=data2 + punpckhdq mm7,mm2 ; mm7=(03 13 23 33)=data3 + movq mm6,mm1 ; transpose coefficients(phase 2) + punpckldq mm1,mm3 ; mm1=(04 14 24 34)=data4 + punpckhdq mm6,mm3 ; mm6=(05 15 25 35)=data5 + + movq mm2,mm7 + movq mm3,mm4 + paddw mm7,mm1 ; mm7=data3+data4=tmp3 + paddw mm4,mm6 ; mm4=data2+data5=tmp2 + psubw mm2,mm1 ; mm2=data3-data4=tmp4 + psubw mm3,mm6 ; mm3=data2-data5=tmp5 + + ; -- Even part + + movq mm1,mm5 + movq mm6,mm0 + psubw mm5,mm7 ; mm5=tmp13 + psubw mm0,mm4 ; mm0=tmp12 + paddw mm1,mm7 ; mm1=tmp10 + paddw mm6,mm4 ; mm6=tmp11 + + paddw mm0,mm5 + psllw mm0,PRE_MULTIPLY_SCALE_BITS + pmulhw mm0,[GOTOFF(ebx,PW_F0707)] ; mm0=z1 + + movq mm7,mm1 + movq mm4,mm5 + psubw mm1,mm6 ; mm1=data4 + psubw mm5,mm0 ; mm5=data6 + paddw mm7,mm6 ; mm7=data0 + paddw mm4,mm0 ; mm4=data2 + + movq MMWORD [MMBLOCK(0,1,edx,SIZEOF_DCTELEM)], mm1 + movq MMWORD [MMBLOCK(2,1,edx,SIZEOF_DCTELEM)], mm5 + movq MMWORD [MMBLOCK(0,0,edx,SIZEOF_DCTELEM)], mm7 + movq MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)], mm4 + + ; -- Odd part + + movq mm6, MMWORD [wk(0)] ; mm6=tmp6 + movq mm0, MMWORD [wk(1)] ; mm0=tmp7 + + paddw mm2,mm3 ; mm2=tmp10 + paddw mm3,mm6 ; mm3=tmp11 + paddw mm6,mm0 ; mm6=tmp12, mm0=tmp7 + + psllw mm2,PRE_MULTIPLY_SCALE_BITS + psllw mm6,PRE_MULTIPLY_SCALE_BITS + + psllw mm3,PRE_MULTIPLY_SCALE_BITS + pmulhw mm3,[GOTOFF(ebx,PW_F0707)] ; mm3=z3 + + movq mm1,mm2 ; mm1=tmp10 + psubw mm2,mm6 + pmulhw mm2,[GOTOFF(ebx,PW_F0382)] ; mm2=z5 + pmulhw mm1,[GOTOFF(ebx,PW_F0541)] ; mm1=MULTIPLY(tmp10,FIX_0_54119610) + pmulhw mm6,[GOTOFF(ebx,PW_F1306)] ; mm6=MULTIPLY(tmp12,FIX_1_30656296) + paddw mm1,mm2 ; mm1=z2 + paddw mm6,mm2 ; mm6=z4 + + movq mm5,mm0 + psubw mm0,mm3 ; mm0=z13 + paddw mm5,mm3 ; mm5=z11 + + movq mm7,mm0 + movq mm4,mm5 + psubw mm0,mm1 ; mm0=data3 + psubw mm5,mm6 ; mm5=data7 + paddw mm7,mm1 ; mm7=data5 + paddw mm4,mm6 ; mm4=data1 + + movq MMWORD [MMBLOCK(3,0,edx,SIZEOF_DCTELEM)], mm0 + movq MMWORD [MMBLOCK(3,1,edx,SIZEOF_DCTELEM)], mm5 + movq MMWORD [MMBLOCK(1,1,edx,SIZEOF_DCTELEM)], mm7 + movq MMWORD [MMBLOCK(1,0,edx,SIZEOF_DCTELEM)], mm4 + + add edx, byte 4*DCTSIZE*SIZEOF_DCTELEM + dec ecx + jnz near .rowloop + + ; ---- Pass 2: process columns. + + mov edx, POINTER [data(eax)] ; (DCTELEM *) + mov ecx, DCTSIZE/4 + alignx 16,7 +.columnloop: + + movq mm0, MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)] + movq mm1, MMWORD [MMBLOCK(3,0,edx,SIZEOF_DCTELEM)] + movq mm2, MMWORD [MMBLOCK(6,0,edx,SIZEOF_DCTELEM)] + movq mm3, MMWORD [MMBLOCK(7,0,edx,SIZEOF_DCTELEM)] + + ; mm0=(02 12 22 32), mm2=(42 52 62 72) + ; mm1=(03 13 23 33), mm3=(43 53 63 73) + + movq mm4,mm0 ; transpose coefficients(phase 1) + punpcklwd mm0,mm1 ; mm0=(02 03 12 13) + punpckhwd mm4,mm1 ; mm4=(22 23 32 33) + movq mm5,mm2 ; transpose coefficients(phase 1) + punpcklwd mm2,mm3 ; mm2=(42 43 52 53) + punpckhwd mm5,mm3 ; mm5=(62 63 72 73) + + movq mm6, MMWORD [MMBLOCK(0,0,edx,SIZEOF_DCTELEM)] + movq mm7, MMWORD [MMBLOCK(1,0,edx,SIZEOF_DCTELEM)] + movq mm1, MMWORD [MMBLOCK(4,0,edx,SIZEOF_DCTELEM)] + movq mm3, MMWORD [MMBLOCK(5,0,edx,SIZEOF_DCTELEM)] + + ; mm6=(00 10 20 30), mm1=(40 50 60 70) + ; mm7=(01 11 21 31), mm3=(41 51 61 71) + + movq MMWORD [wk(0)], mm4 ; wk(0)=(22 23 32 33) + movq MMWORD [wk(1)], mm2 ; wk(1)=(42 43 52 53) + + movq mm4,mm6 ; transpose coefficients(phase 1) + punpcklwd mm6,mm7 ; mm6=(00 01 10 11) + punpckhwd mm4,mm7 ; mm4=(20 21 30 31) + movq mm2,mm1 ; transpose coefficients(phase 1) + punpcklwd mm1,mm3 ; mm1=(40 41 50 51) + punpckhwd mm2,mm3 ; mm2=(60 61 70 71) + + movq mm7,mm6 ; transpose coefficients(phase 2) + punpckldq mm6,mm0 ; mm6=(00 01 02 03)=data0 + punpckhdq mm7,mm0 ; mm7=(10 11 12 13)=data1 + movq mm3,mm2 ; transpose coefficients(phase 2) + punpckldq mm2,mm5 ; mm2=(60 61 62 63)=data6 + punpckhdq mm3,mm5 ; mm3=(70 71 72 73)=data7 + + movq mm0,mm7 + movq mm5,mm6 + psubw mm7,mm2 ; mm7=data1-data6=tmp6 + psubw mm6,mm3 ; mm6=data0-data7=tmp7 + paddw mm0,mm2 ; mm0=data1+data6=tmp1 + paddw mm5,mm3 ; mm5=data0+data7=tmp0 + + movq mm2, MMWORD [wk(0)] ; mm2=(22 23 32 33) + movq mm3, MMWORD [wk(1)] ; mm3=(42 43 52 53) + movq MMWORD [wk(0)], mm7 ; wk(0)=tmp6 + movq MMWORD [wk(1)], mm6 ; wk(1)=tmp7 + + movq mm7,mm4 ; transpose coefficients(phase 2) + punpckldq mm4,mm2 ; mm4=(20 21 22 23)=data2 + punpckhdq mm7,mm2 ; mm7=(30 31 32 33)=data3 + movq mm6,mm1 ; transpose coefficients(phase 2) + punpckldq mm1,mm3 ; mm1=(40 41 42 43)=data4 + punpckhdq mm6,mm3 ; mm6=(50 51 52 53)=data5 + + movq mm2,mm7 + movq mm3,mm4 + paddw mm7,mm1 ; mm7=data3+data4=tmp3 + paddw mm4,mm6 ; mm4=data2+data5=tmp2 + psubw mm2,mm1 ; mm2=data3-data4=tmp4 + psubw mm3,mm6 ; mm3=data2-data5=tmp5 + + ; -- Even part + + movq mm1,mm5 + movq mm6,mm0 + psubw mm5,mm7 ; mm5=tmp13 + psubw mm0,mm4 ; mm0=tmp12 + paddw mm1,mm7 ; mm1=tmp10 + paddw mm6,mm4 ; mm6=tmp11 + + paddw mm0,mm5 + psllw mm0,PRE_MULTIPLY_SCALE_BITS + pmulhw mm0,[GOTOFF(ebx,PW_F0707)] ; mm0=z1 + + movq mm7,mm1 + movq mm4,mm5 + psubw mm1,mm6 ; mm1=data4 + psubw mm5,mm0 ; mm5=data6 + paddw mm7,mm6 ; mm7=data0 + paddw mm4,mm0 ; mm4=data2 + + movq MMWORD [MMBLOCK(4,0,edx,SIZEOF_DCTELEM)], mm1 + movq MMWORD [MMBLOCK(6,0,edx,SIZEOF_DCTELEM)], mm5 + movq MMWORD [MMBLOCK(0,0,edx,SIZEOF_DCTELEM)], mm7 + movq MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)], mm4 + + ; -- Odd part + + movq mm6, MMWORD [wk(0)] ; mm6=tmp6 + movq mm0, MMWORD [wk(1)] ; mm0=tmp7 + + paddw mm2,mm3 ; mm2=tmp10 + paddw mm3,mm6 ; mm3=tmp11 + paddw mm6,mm0 ; mm6=tmp12, mm0=tmp7 + + psllw mm2,PRE_MULTIPLY_SCALE_BITS + psllw mm6,PRE_MULTIPLY_SCALE_BITS + + psllw mm3,PRE_MULTIPLY_SCALE_BITS + pmulhw mm3,[GOTOFF(ebx,PW_F0707)] ; mm3=z3 + + movq mm1,mm2 ; mm1=tmp10 + psubw mm2,mm6 + pmulhw mm2,[GOTOFF(ebx,PW_F0382)] ; mm2=z5 + pmulhw mm1,[GOTOFF(ebx,PW_F0541)] ; mm1=MULTIPLY(tmp10,FIX_0_54119610) + pmulhw mm6,[GOTOFF(ebx,PW_F1306)] ; mm6=MULTIPLY(tmp12,FIX_1_30656296) + paddw mm1,mm2 ; mm1=z2 + paddw mm6,mm2 ; mm6=z4 + + movq mm5,mm0 + psubw mm0,mm3 ; mm0=z13 + paddw mm5,mm3 ; mm5=z11 + + movq mm7,mm0 + movq mm4,mm5 + psubw mm0,mm1 ; mm0=data3 + psubw mm5,mm6 ; mm5=data7 + paddw mm7,mm1 ; mm7=data5 + paddw mm4,mm6 ; mm4=data1 + + movq MMWORD [MMBLOCK(3,0,edx,SIZEOF_DCTELEM)], mm0 + movq MMWORD [MMBLOCK(7,0,edx,SIZEOF_DCTELEM)], mm5 + movq MMWORD [MMBLOCK(5,0,edx,SIZEOF_DCTELEM)], mm7 + movq MMWORD [MMBLOCK(1,0,edx,SIZEOF_DCTELEM)], mm4 + + add edx, byte 4*SIZEOF_DCTELEM + dec ecx + jnz near .columnloop + + emms ; empty MMX state + +; pop edi ; unused +; pop esi ; unused +; pop edx ; need not be preserved +; pop ecx ; need not be preserved + poppic ebx + mov esp,ebp ; esp <- aligned ebp + pop esp ; esp <- original ebp + pop ebp + ret + +; For some reason, the OS X linker does not honor the request to align the +; segment unless we do this. + align 16 diff --git a/media/libjpeg/simd/jfdctfst-sse2-64.asm b/media/libjpeg/simd/jfdctfst-sse2-64.asm new file mode 100644 index 000000000..4c9668542 --- /dev/null +++ b/media/libjpeg/simd/jfdctfst-sse2-64.asm @@ -0,0 +1,391 @@ +; +; jfdctfst.asm - fast integer FDCT (64-bit SSE2) +; +; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB +; Copyright (C) 2009, D. R. Commander. +; +; Based on the x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler). +; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 +; +; This file contains a fast, not so accurate integer implementation of +; the forward DCT (Discrete Cosine Transform). The following code is +; based directly on the IJG's original jfdctfst.c; see the jfdctfst.c +; for more details. +; +; [TAB8] + +%include "jsimdext.inc" +%include "jdct.inc" + +; -------------------------------------------------------------------------- + +%define CONST_BITS 8 ; 14 is also OK. + +%if CONST_BITS == 8 +F_0_382 equ 98 ; FIX(0.382683433) +F_0_541 equ 139 ; FIX(0.541196100) +F_0_707 equ 181 ; FIX(0.707106781) +F_1_306 equ 334 ; FIX(1.306562965) +%else +; NASM cannot do compile-time arithmetic on floating-point constants. +%define DESCALE(x,n) (((x)+(1<<((n)-1)))>>(n)) +F_0_382 equ DESCALE( 410903207,30-CONST_BITS) ; FIX(0.382683433) +F_0_541 equ DESCALE( 581104887,30-CONST_BITS) ; FIX(0.541196100) +F_0_707 equ DESCALE( 759250124,30-CONST_BITS) ; FIX(0.707106781) +F_1_306 equ DESCALE(1402911301,30-CONST_BITS) ; FIX(1.306562965) +%endif + +; -------------------------------------------------------------------------- + SECTION SEG_CONST + +; PRE_MULTIPLY_SCALE_BITS <= 2 (to avoid overflow) +; CONST_BITS + CONST_SHIFT + PRE_MULTIPLY_SCALE_BITS == 16 (for pmulhw) + +%define PRE_MULTIPLY_SCALE_BITS 2 +%define CONST_SHIFT (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS) + + alignz 16 + global EXTN(jconst_fdct_ifast_sse2) + +EXTN(jconst_fdct_ifast_sse2): + +PW_F0707 times 8 dw F_0_707 << CONST_SHIFT +PW_F0382 times 8 dw F_0_382 << CONST_SHIFT +PW_F0541 times 8 dw F_0_541 << CONST_SHIFT +PW_F1306 times 8 dw F_1_306 << CONST_SHIFT + + alignz 16 + +; -------------------------------------------------------------------------- + SECTION SEG_TEXT + BITS 64 +; +; Perform the forward DCT on one block of samples. +; +; GLOBAL(void) +; jsimd_fdct_ifast_sse2 (DCTELEM *data) +; + +; r10 = DCTELEM *data + +%define wk(i) rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM] +%define WK_NUM 2 + + align 16 + global EXTN(jsimd_fdct_ifast_sse2) + +EXTN(jsimd_fdct_ifast_sse2): + push rbp + mov rax,rsp ; rax = original rbp + sub rsp, byte 4 + and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits + mov [rsp],rax + mov rbp,rsp ; rbp = aligned rbp + lea rsp, [wk(0)] + collect_args + + ; ---- Pass 1: process rows. + + mov rdx, r10 ; (DCTELEM *) + + movdqa xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_DCTELEM)] + movdqa xmm1, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_DCTELEM)] + movdqa xmm2, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_DCTELEM)] + movdqa xmm3, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_DCTELEM)] + + ; xmm0=(00 01 02 03 04 05 06 07), xmm2=(20 21 22 23 24 25 26 27) + ; xmm1=(10 11 12 13 14 15 16 17), xmm3=(30 31 32 33 34 35 36 37) + + movdqa xmm4,xmm0 ; transpose coefficients(phase 1) + punpcklwd xmm0,xmm1 ; xmm0=(00 10 01 11 02 12 03 13) + punpckhwd xmm4,xmm1 ; xmm4=(04 14 05 15 06 16 07 17) + movdqa xmm5,xmm2 ; transpose coefficients(phase 1) + punpcklwd xmm2,xmm3 ; xmm2=(20 30 21 31 22 32 23 33) + punpckhwd xmm5,xmm3 ; xmm5=(24 34 25 35 26 36 27 37) + + movdqa xmm6, XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_DCTELEM)] + movdqa xmm7, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_DCTELEM)] + movdqa xmm1, XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_DCTELEM)] + movdqa xmm3, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_DCTELEM)] + + ; xmm6=( 4 12 20 28 36 44 52 60), xmm1=( 6 14 22 30 38 46 54 62) + ; xmm7=( 5 13 21 29 37 45 53 61), xmm3=( 7 15 23 31 39 47 55 63) + + movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=(20 30 21 31 22 32 23 33) + movdqa XMMWORD [wk(1)], xmm5 ; wk(1)=(24 34 25 35 26 36 27 37) + + movdqa xmm2,xmm6 ; transpose coefficients(phase 1) + punpcklwd xmm6,xmm7 ; xmm6=(40 50 41 51 42 52 43 53) + punpckhwd xmm2,xmm7 ; xmm2=(44 54 45 55 46 56 47 57) + movdqa xmm5,xmm1 ; transpose coefficients(phase 1) + punpcklwd xmm1,xmm3 ; xmm1=(60 70 61 71 62 72 63 73) + punpckhwd xmm5,xmm3 ; xmm5=(64 74 65 75 66 76 67 77) + + movdqa xmm7,xmm6 ; transpose coefficients(phase 2) + punpckldq xmm6,xmm1 ; xmm6=(40 50 60 70 41 51 61 71) + punpckhdq xmm7,xmm1 ; xmm7=(42 52 62 72 43 53 63 73) + movdqa xmm3,xmm2 ; transpose coefficients(phase 2) + punpckldq xmm2,xmm5 ; xmm2=(44 54 64 74 45 55 65 75) + punpckhdq xmm3,xmm5 ; xmm3=(46 56 66 76 47 57 67 77) + + movdqa xmm1, XMMWORD [wk(0)] ; xmm1=(20 30 21 31 22 32 23 33) + movdqa xmm5, XMMWORD [wk(1)] ; xmm5=(24 34 25 35 26 36 27 37) + movdqa XMMWORD [wk(0)], xmm7 ; wk(0)=(42 52 62 72 43 53 63 73) + movdqa XMMWORD [wk(1)], xmm2 ; wk(1)=(44 54 64 74 45 55 65 75) + + movdqa xmm7,xmm0 ; transpose coefficients(phase 2) + punpckldq xmm0,xmm1 ; xmm0=(00 10 20 30 01 11 21 31) + punpckhdq xmm7,xmm1 ; xmm7=(02 12 22 32 03 13 23 33) + movdqa xmm2,xmm4 ; transpose coefficients(phase 2) + punpckldq xmm4,xmm5 ; xmm4=(04 14 24 34 05 15 25 35) + punpckhdq xmm2,xmm5 ; xmm2=(06 16 26 36 07 17 27 37) + + movdqa xmm1,xmm0 ; transpose coefficients(phase 3) + punpcklqdq xmm0,xmm6 ; xmm0=(00 10 20 30 40 50 60 70)=data0 + punpckhqdq xmm1,xmm6 ; xmm1=(01 11 21 31 41 51 61 71)=data1 + movdqa xmm5,xmm2 ; transpose coefficients(phase 3) + punpcklqdq xmm2,xmm3 ; xmm2=(06 16 26 36 46 56 66 76)=data6 + punpckhqdq xmm5,xmm3 ; xmm5=(07 17 27 37 47 57 67 77)=data7 + + movdqa xmm6,xmm1 + movdqa xmm3,xmm0 + psubw xmm1,xmm2 ; xmm1=data1-data6=tmp6 + psubw xmm0,xmm5 ; xmm0=data0-data7=tmp7 + paddw xmm6,xmm2 ; xmm6=data1+data6=tmp1 + paddw xmm3,xmm5 ; xmm3=data0+data7=tmp0 + + movdqa xmm2, XMMWORD [wk(0)] ; xmm2=(42 52 62 72 43 53 63 73) + movdqa xmm5, XMMWORD [wk(1)] ; xmm5=(44 54 64 74 45 55 65 75) + movdqa XMMWORD [wk(0)], xmm1 ; wk(0)=tmp6 + movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=tmp7 + + movdqa xmm1,xmm7 ; transpose coefficients(phase 3) + punpcklqdq xmm7,xmm2 ; xmm7=(02 12 22 32 42 52 62 72)=data2 + punpckhqdq xmm1,xmm2 ; xmm1=(03 13 23 33 43 53 63 73)=data3 + movdqa xmm0,xmm4 ; transpose coefficients(phase 3) + punpcklqdq xmm4,xmm5 ; xmm4=(04 14 24 34 44 54 64 74)=data4 + punpckhqdq xmm0,xmm5 ; xmm0=(05 15 25 35 45 55 65 75)=data5 + + movdqa xmm2,xmm1 + movdqa xmm5,xmm7 + paddw xmm1,xmm4 ; xmm1=data3+data4=tmp3 + paddw xmm7,xmm0 ; xmm7=data2+data5=tmp2 + psubw xmm2,xmm4 ; xmm2=data3-data4=tmp4 + psubw xmm5,xmm0 ; xmm5=data2-data5=tmp5 + + ; -- Even part + + movdqa xmm4,xmm3 + movdqa xmm0,xmm6 + psubw xmm3,xmm1 ; xmm3=tmp13 + psubw xmm6,xmm7 ; xmm6=tmp12 + paddw xmm4,xmm1 ; xmm4=tmp10 + paddw xmm0,xmm7 ; xmm0=tmp11 + + paddw xmm6,xmm3 + psllw xmm6,PRE_MULTIPLY_SCALE_BITS + pmulhw xmm6,[rel PW_F0707] ; xmm6=z1 + + movdqa xmm1,xmm4 + movdqa xmm7,xmm3 + psubw xmm4,xmm0 ; xmm4=data4 + psubw xmm3,xmm6 ; xmm3=data6 + paddw xmm1,xmm0 ; xmm1=data0 + paddw xmm7,xmm6 ; xmm7=data2 + + movdqa xmm0, XMMWORD [wk(0)] ; xmm0=tmp6 + movdqa xmm6, XMMWORD [wk(1)] ; xmm6=tmp7 + movdqa XMMWORD [wk(0)], xmm4 ; wk(0)=data4 + movdqa XMMWORD [wk(1)], xmm3 ; wk(1)=data6 + + ; -- Odd part + + paddw xmm2,xmm5 ; xmm2=tmp10 + paddw xmm5,xmm0 ; xmm5=tmp11 + paddw xmm0,xmm6 ; xmm0=tmp12, xmm6=tmp7 + + psllw xmm2,PRE_MULTIPLY_SCALE_BITS + psllw xmm0,PRE_MULTIPLY_SCALE_BITS + + psllw xmm5,PRE_MULTIPLY_SCALE_BITS + pmulhw xmm5,[rel PW_F0707] ; xmm5=z3 + + movdqa xmm4,xmm2 ; xmm4=tmp10 + psubw xmm2,xmm0 + pmulhw xmm2,[rel PW_F0382] ; xmm2=z5 + pmulhw xmm4,[rel PW_F0541] ; xmm4=MULTIPLY(tmp10,FIX_0_541196) + pmulhw xmm0,[rel PW_F1306] ; xmm0=MULTIPLY(tmp12,FIX_1_306562) + paddw xmm4,xmm2 ; xmm4=z2 + paddw xmm0,xmm2 ; xmm0=z4 + + movdqa xmm3,xmm6 + psubw xmm6,xmm5 ; xmm6=z13 + paddw xmm3,xmm5 ; xmm3=z11 + + movdqa xmm2,xmm6 + movdqa xmm5,xmm3 + psubw xmm6,xmm4 ; xmm6=data3 + psubw xmm3,xmm0 ; xmm3=data7 + paddw xmm2,xmm4 ; xmm2=data5 + paddw xmm5,xmm0 ; xmm5=data1 + + ; ---- Pass 2: process columns. + + ; xmm1=(00 10 20 30 40 50 60 70), xmm7=(02 12 22 32 42 52 62 72) + ; xmm5=(01 11 21 31 41 51 61 71), xmm6=(03 13 23 33 43 53 63 73) + + movdqa xmm4,xmm1 ; transpose coefficients(phase 1) + punpcklwd xmm1,xmm5 ; xmm1=(00 01 10 11 20 21 30 31) + punpckhwd xmm4,xmm5 ; xmm4=(40 41 50 51 60 61 70 71) + movdqa xmm0,xmm7 ; transpose coefficients(phase 1) + punpcklwd xmm7,xmm6 ; xmm7=(02 03 12 13 22 23 32 33) + punpckhwd xmm0,xmm6 ; xmm0=(42 43 52 53 62 63 72 73) + + movdqa xmm5, XMMWORD [wk(0)] ; xmm5=col4 + movdqa xmm6, XMMWORD [wk(1)] ; xmm6=col6 + + ; xmm5=(04 14 24 34 44 54 64 74), xmm6=(06 16 26 36 46 56 66 76) + ; xmm2=(05 15 25 35 45 55 65 75), xmm3=(07 17 27 37 47 57 67 77) + + movdqa XMMWORD [wk(0)], xmm7 ; wk(0)=(02 03 12 13 22 23 32 33) + movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=(42 43 52 53 62 63 72 73) + + movdqa xmm7,xmm5 ; transpose coefficients(phase 1) + punpcklwd xmm5,xmm2 ; xmm5=(04 05 14 15 24 25 34 35) + punpckhwd xmm7,xmm2 ; xmm7=(44 45 54 55 64 65 74 75) + movdqa xmm0,xmm6 ; transpose coefficients(phase 1) + punpcklwd xmm6,xmm3 ; xmm6=(06 07 16 17 26 27 36 37) + punpckhwd xmm0,xmm3 ; xmm0=(46 47 56 57 66 67 76 77) + + movdqa xmm2,xmm5 ; transpose coefficients(phase 2) + punpckldq xmm5,xmm6 ; xmm5=(04 05 06 07 14 15 16 17) + punpckhdq xmm2,xmm6 ; xmm2=(24 25 26 27 34 35 36 37) + movdqa xmm3,xmm7 ; transpose coefficients(phase 2) + punpckldq xmm7,xmm0 ; xmm7=(44 45 46 47 54 55 56 57) + punpckhdq xmm3,xmm0 ; xmm3=(64 65 66 67 74 75 76 77) + + movdqa xmm6, XMMWORD [wk(0)] ; xmm6=(02 03 12 13 22 23 32 33) + movdqa xmm0, XMMWORD [wk(1)] ; xmm0=(42 43 52 53 62 63 72 73) + movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=(24 25 26 27 34 35 36 37) + movdqa XMMWORD [wk(1)], xmm7 ; wk(1)=(44 45 46 47 54 55 56 57) + + movdqa xmm2,xmm1 ; transpose coefficients(phase 2) + punpckldq xmm1,xmm6 ; xmm1=(00 01 02 03 10 11 12 13) + punpckhdq xmm2,xmm6 ; xmm2=(20 21 22 23 30 31 32 33) + movdqa xmm7,xmm4 ; transpose coefficients(phase 2) + punpckldq xmm4,xmm0 ; xmm4=(40 41 42 43 50 51 52 53) + punpckhdq xmm7,xmm0 ; xmm7=(60 61 62 63 70 71 72 73) + + movdqa xmm6,xmm1 ; transpose coefficients(phase 3) + punpcklqdq xmm1,xmm5 ; xmm1=(00 01 02 03 04 05 06 07)=data0 + punpckhqdq xmm6,xmm5 ; xmm6=(10 11 12 13 14 15 16 17)=data1 + movdqa xmm0,xmm7 ; transpose coefficients(phase 3) + punpcklqdq xmm7,xmm3 ; xmm7=(60 61 62 63 64 65 66 67)=data6 + punpckhqdq xmm0,xmm3 ; xmm0=(70 71 72 73 74 75 76 77)=data7 + + movdqa xmm5,xmm6 + movdqa xmm3,xmm1 + psubw xmm6,xmm7 ; xmm6=data1-data6=tmp6 + psubw xmm1,xmm0 ; xmm1=data0-data7=tmp7 + paddw xmm5,xmm7 ; xmm5=data1+data6=tmp1 + paddw xmm3,xmm0 ; xmm3=data0+data7=tmp0 + + movdqa xmm7, XMMWORD [wk(0)] ; xmm7=(24 25 26 27 34 35 36 37) + movdqa xmm0, XMMWORD [wk(1)] ; xmm0=(44 45 46 47 54 55 56 57) + movdqa XMMWORD [wk(0)], xmm6 ; wk(0)=tmp6 + movdqa XMMWORD [wk(1)], xmm1 ; wk(1)=tmp7 + + movdqa xmm6,xmm2 ; transpose coefficients(phase 3) + punpcklqdq xmm2,xmm7 ; xmm2=(20 21 22 23 24 25 26 27)=data2 + punpckhqdq xmm6,xmm7 ; xmm6=(30 31 32 33 34 35 36 37)=data3 + movdqa xmm1,xmm4 ; transpose coefficients(phase 3) + punpcklqdq xmm4,xmm0 ; xmm4=(40 41 42 43 44 45 46 47)=data4 + punpckhqdq xmm1,xmm0 ; xmm1=(50 51 52 53 54 55 56 57)=data5 + + movdqa xmm7,xmm6 + movdqa xmm0,xmm2 + paddw xmm6,xmm4 ; xmm6=data3+data4=tmp3 + paddw xmm2,xmm1 ; xmm2=data2+data5=tmp2 + psubw xmm7,xmm4 ; xmm7=data3-data4=tmp4 + psubw xmm0,xmm1 ; xmm0=data2-data5=tmp5 + + ; -- Even part + + movdqa xmm4,xmm3 + movdqa xmm1,xmm5 + psubw xmm3,xmm6 ; xmm3=tmp13 + psubw xmm5,xmm2 ; xmm5=tmp12 + paddw xmm4,xmm6 ; xmm4=tmp10 + paddw xmm1,xmm2 ; xmm1=tmp11 + + paddw xmm5,xmm3 + psllw xmm5,PRE_MULTIPLY_SCALE_BITS + pmulhw xmm5,[rel PW_F0707] ; xmm5=z1 + + movdqa xmm6,xmm4 + movdqa xmm2,xmm3 + psubw xmm4,xmm1 ; xmm4=data4 + psubw xmm3,xmm5 ; xmm3=data6 + paddw xmm6,xmm1 ; xmm6=data0 + paddw xmm2,xmm5 ; xmm2=data2 + + movdqa XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_DCTELEM)], xmm4 + movdqa XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_DCTELEM)], xmm3 + movdqa XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_DCTELEM)], xmm6 + movdqa XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_DCTELEM)], xmm2 + + ; -- Odd part + + movdqa xmm1, XMMWORD [wk(0)] ; xmm1=tmp6 + movdqa xmm5, XMMWORD [wk(1)] ; xmm5=tmp7 + + paddw xmm7,xmm0 ; xmm7=tmp10 + paddw xmm0,xmm1 ; xmm0=tmp11 + paddw xmm1,xmm5 ; xmm1=tmp12, xmm5=tmp7 + + psllw xmm7,PRE_MULTIPLY_SCALE_BITS + psllw xmm1,PRE_MULTIPLY_SCALE_BITS + + psllw xmm0,PRE_MULTIPLY_SCALE_BITS + pmulhw xmm0,[rel PW_F0707] ; xmm0=z3 + + movdqa xmm4,xmm7 ; xmm4=tmp10 + psubw xmm7,xmm1 + pmulhw xmm7,[rel PW_F0382] ; xmm7=z5 + pmulhw xmm4,[rel PW_F0541] ; xmm4=MULTIPLY(tmp10,FIX_0_541196) + pmulhw xmm1,[rel PW_F1306] ; xmm1=MULTIPLY(tmp12,FIX_1_306562) + paddw xmm4,xmm7 ; xmm4=z2 + paddw xmm1,xmm7 ; xmm1=z4 + + movdqa xmm3,xmm5 + psubw xmm5,xmm0 ; xmm5=z13 + paddw xmm3,xmm0 ; xmm3=z11 + + movdqa xmm6,xmm5 + movdqa xmm2,xmm3 + psubw xmm5,xmm4 ; xmm5=data3 + psubw xmm3,xmm1 ; xmm3=data7 + paddw xmm6,xmm4 ; xmm6=data5 + paddw xmm2,xmm1 ; xmm2=data1 + + movdqa XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_DCTELEM)], xmm5 + movdqa XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_DCTELEM)], xmm3 + movdqa XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_DCTELEM)], xmm6 + movdqa XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_DCTELEM)], xmm2 + + uncollect_args + mov rsp,rbp ; rsp <- aligned rbp + pop rsp ; rsp <- original rbp + pop rbp + ret + +; For some reason, the OS X linker does not honor the request to align the +; segment unless we do this. + align 16 diff --git a/media/libjpeg/simd/jfdctfst-sse2.asm b/media/libjpeg/simd/jfdctfst-sse2.asm new file mode 100644 index 000000000..54856a236 --- /dev/null +++ b/media/libjpeg/simd/jfdctfst-sse2.asm @@ -0,0 +1,403 @@ +; +; jfdctfst.asm - fast integer FDCT (SSE2) +; +; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB +; +; Based on the x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler). +; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 +; +; This file contains a fast, not so accurate integer implementation of +; the forward DCT (Discrete Cosine Transform). The following code is +; based directly on the IJG's original jfdctfst.c; see the jfdctfst.c +; for more details. +; +; [TAB8] + +%include "jsimdext.inc" +%include "jdct.inc" + +; -------------------------------------------------------------------------- + +%define CONST_BITS 8 ; 14 is also OK. + +%if CONST_BITS == 8 +F_0_382 equ 98 ; FIX(0.382683433) +F_0_541 equ 139 ; FIX(0.541196100) +F_0_707 equ 181 ; FIX(0.707106781) +F_1_306 equ 334 ; FIX(1.306562965) +%else +; NASM cannot do compile-time arithmetic on floating-point constants. +%define DESCALE(x,n) (((x)+(1<<((n)-1)))>>(n)) +F_0_382 equ DESCALE( 410903207,30-CONST_BITS) ; FIX(0.382683433) +F_0_541 equ DESCALE( 581104887,30-CONST_BITS) ; FIX(0.541196100) +F_0_707 equ DESCALE( 759250124,30-CONST_BITS) ; FIX(0.707106781) +F_1_306 equ DESCALE(1402911301,30-CONST_BITS) ; FIX(1.306562965) +%endif + +; -------------------------------------------------------------------------- + SECTION SEG_CONST + +; PRE_MULTIPLY_SCALE_BITS <= 2 (to avoid overflow) +; CONST_BITS + CONST_SHIFT + PRE_MULTIPLY_SCALE_BITS == 16 (for pmulhw) + +%define PRE_MULTIPLY_SCALE_BITS 2 +%define CONST_SHIFT (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS) + + alignz 16 + global EXTN(jconst_fdct_ifast_sse2) + +EXTN(jconst_fdct_ifast_sse2): + +PW_F0707 times 8 dw F_0_707 << CONST_SHIFT +PW_F0382 times 8 dw F_0_382 << CONST_SHIFT +PW_F0541 times 8 dw F_0_541 << CONST_SHIFT +PW_F1306 times 8 dw F_1_306 << CONST_SHIFT + + alignz 16 + +; -------------------------------------------------------------------------- + SECTION SEG_TEXT + BITS 32 +; +; Perform the forward DCT on one block of samples. +; +; GLOBAL(void) +; jsimd_fdct_ifast_sse2 (DCTELEM *data) +; + +%define data(b) (b)+8 ; DCTELEM *data + +%define original_ebp ebp+0 +%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM] +%define WK_NUM 2 + + align 16 + global EXTN(jsimd_fdct_ifast_sse2) + +EXTN(jsimd_fdct_ifast_sse2): + push ebp + mov eax,esp ; eax = original ebp + sub esp, byte 4 + and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits + mov [esp],eax + mov ebp,esp ; ebp = aligned ebp + lea esp, [wk(0)] + pushpic ebx +; push ecx ; unused +; push edx ; need not be preserved +; push esi ; unused +; push edi ; unused + + get_GOT ebx ; get GOT address + + ; ---- Pass 1: process rows. + + mov edx, POINTER [data(eax)] ; (DCTELEM *) + + movdqa xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_DCTELEM)] + movdqa xmm1, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_DCTELEM)] + movdqa xmm2, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_DCTELEM)] + movdqa xmm3, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_DCTELEM)] + + ; xmm0=(00 01 02 03 04 05 06 07), xmm2=(20 21 22 23 24 25 26 27) + ; xmm1=(10 11 12 13 14 15 16 17), xmm3=(30 31 32 33 34 35 36 37) + + movdqa xmm4,xmm0 ; transpose coefficients(phase 1) + punpcklwd xmm0,xmm1 ; xmm0=(00 10 01 11 02 12 03 13) + punpckhwd xmm4,xmm1 ; xmm4=(04 14 05 15 06 16 07 17) + movdqa xmm5,xmm2 ; transpose coefficients(phase 1) + punpcklwd xmm2,xmm3 ; xmm2=(20 30 21 31 22 32 23 33) + punpckhwd xmm5,xmm3 ; xmm5=(24 34 25 35 26 36 27 37) + + movdqa xmm6, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_DCTELEM)] + movdqa xmm7, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_DCTELEM)] + movdqa xmm1, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_DCTELEM)] + movdqa xmm3, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_DCTELEM)] + + ; xmm6=( 4 12 20 28 36 44 52 60), xmm1=( 6 14 22 30 38 46 54 62) + ; xmm7=( 5 13 21 29 37 45 53 61), xmm3=( 7 15 23 31 39 47 55 63) + + movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=(20 30 21 31 22 32 23 33) + movdqa XMMWORD [wk(1)], xmm5 ; wk(1)=(24 34 25 35 26 36 27 37) + + movdqa xmm2,xmm6 ; transpose coefficients(phase 1) + punpcklwd xmm6,xmm7 ; xmm6=(40 50 41 51 42 52 43 53) + punpckhwd xmm2,xmm7 ; xmm2=(44 54 45 55 46 56 47 57) + movdqa xmm5,xmm1 ; transpose coefficients(phase 1) + punpcklwd xmm1,xmm3 ; xmm1=(60 70 61 71 62 72 63 73) + punpckhwd xmm5,xmm3 ; xmm5=(64 74 65 75 66 76 67 77) + + movdqa xmm7,xmm6 ; transpose coefficients(phase 2) + punpckldq xmm6,xmm1 ; xmm6=(40 50 60 70 41 51 61 71) + punpckhdq xmm7,xmm1 ; xmm7=(42 52 62 72 43 53 63 73) + movdqa xmm3,xmm2 ; transpose coefficients(phase 2) + punpckldq xmm2,xmm5 ; xmm2=(44 54 64 74 45 55 65 75) + punpckhdq xmm3,xmm5 ; xmm3=(46 56 66 76 47 57 67 77) + + movdqa xmm1, XMMWORD [wk(0)] ; xmm1=(20 30 21 31 22 32 23 33) + movdqa xmm5, XMMWORD [wk(1)] ; xmm5=(24 34 25 35 26 36 27 37) + movdqa XMMWORD [wk(0)], xmm7 ; wk(0)=(42 52 62 72 43 53 63 73) + movdqa XMMWORD [wk(1)], xmm2 ; wk(1)=(44 54 64 74 45 55 65 75) + + movdqa xmm7,xmm0 ; transpose coefficients(phase 2) + punpckldq xmm0,xmm1 ; xmm0=(00 10 20 30 01 11 21 31) + punpckhdq xmm7,xmm1 ; xmm7=(02 12 22 32 03 13 23 33) + movdqa xmm2,xmm4 ; transpose coefficients(phase 2) + punpckldq xmm4,xmm5 ; xmm4=(04 14 24 34 05 15 25 35) + punpckhdq xmm2,xmm5 ; xmm2=(06 16 26 36 07 17 27 37) + + movdqa xmm1,xmm0 ; transpose coefficients(phase 3) + punpcklqdq xmm0,xmm6 ; xmm0=(00 10 20 30 40 50 60 70)=data0 + punpckhqdq xmm1,xmm6 ; xmm1=(01 11 21 31 41 51 61 71)=data1 + movdqa xmm5,xmm2 ; transpose coefficients(phase 3) + punpcklqdq xmm2,xmm3 ; xmm2=(06 16 26 36 46 56 66 76)=data6 + punpckhqdq xmm5,xmm3 ; xmm5=(07 17 27 37 47 57 67 77)=data7 + + movdqa xmm6,xmm1 + movdqa xmm3,xmm0 + psubw xmm1,xmm2 ; xmm1=data1-data6=tmp6 + psubw xmm0,xmm5 ; xmm0=data0-data7=tmp7 + paddw xmm6,xmm2 ; xmm6=data1+data6=tmp1 + paddw xmm3,xmm5 ; xmm3=data0+data7=tmp0 + + movdqa xmm2, XMMWORD [wk(0)] ; xmm2=(42 52 62 72 43 53 63 73) + movdqa xmm5, XMMWORD [wk(1)] ; xmm5=(44 54 64 74 45 55 65 75) + movdqa XMMWORD [wk(0)], xmm1 ; wk(0)=tmp6 + movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=tmp7 + + movdqa xmm1,xmm7 ; transpose coefficients(phase 3) + punpcklqdq xmm7,xmm2 ; xmm7=(02 12 22 32 42 52 62 72)=data2 + punpckhqdq xmm1,xmm2 ; xmm1=(03 13 23 33 43 53 63 73)=data3 + movdqa xmm0,xmm4 ; transpose coefficients(phase 3) + punpcklqdq xmm4,xmm5 ; xmm4=(04 14 24 34 44 54 64 74)=data4 + punpckhqdq xmm0,xmm5 ; xmm0=(05 15 25 35 45 55 65 75)=data5 + + movdqa xmm2,xmm1 + movdqa xmm5,xmm7 + paddw xmm1,xmm4 ; xmm1=data3+data4=tmp3 + paddw xmm7,xmm0 ; xmm7=data2+data5=tmp2 + psubw xmm2,xmm4 ; xmm2=data3-data4=tmp4 + psubw xmm5,xmm0 ; xmm5=data2-data5=tmp5 + + ; -- Even part + + movdqa xmm4,xmm3 + movdqa xmm0,xmm6 + psubw xmm3,xmm1 ; xmm3=tmp13 + psubw xmm6,xmm7 ; xmm6=tmp12 + paddw xmm4,xmm1 ; xmm4=tmp10 + paddw xmm0,xmm7 ; xmm0=tmp11 + + paddw xmm6,xmm3 + psllw xmm6,PRE_MULTIPLY_SCALE_BITS + pmulhw xmm6,[GOTOFF(ebx,PW_F0707)] ; xmm6=z1 + + movdqa xmm1,xmm4 + movdqa xmm7,xmm3 + psubw xmm4,xmm0 ; xmm4=data4 + psubw xmm3,xmm6 ; xmm3=data6 + paddw xmm1,xmm0 ; xmm1=data0 + paddw xmm7,xmm6 ; xmm7=data2 + + movdqa xmm0, XMMWORD [wk(0)] ; xmm0=tmp6 + movdqa xmm6, XMMWORD [wk(1)] ; xmm6=tmp7 + movdqa XMMWORD [wk(0)], xmm4 ; wk(0)=data4 + movdqa XMMWORD [wk(1)], xmm3 ; wk(1)=data6 + + ; -- Odd part + + paddw xmm2,xmm5 ; xmm2=tmp10 + paddw xmm5,xmm0 ; xmm5=tmp11 + paddw xmm0,xmm6 ; xmm0=tmp12, xmm6=tmp7 + + psllw xmm2,PRE_MULTIPLY_SCALE_BITS + psllw xmm0,PRE_MULTIPLY_SCALE_BITS + + psllw xmm5,PRE_MULTIPLY_SCALE_BITS + pmulhw xmm5,[GOTOFF(ebx,PW_F0707)] ; xmm5=z3 + + movdqa xmm4,xmm2 ; xmm4=tmp10 + psubw xmm2,xmm0 + pmulhw xmm2,[GOTOFF(ebx,PW_F0382)] ; xmm2=z5 + pmulhw xmm4,[GOTOFF(ebx,PW_F0541)] ; xmm4=MULTIPLY(tmp10,FIX_0_541196) + pmulhw xmm0,[GOTOFF(ebx,PW_F1306)] ; xmm0=MULTIPLY(tmp12,FIX_1_306562) + paddw xmm4,xmm2 ; xmm4=z2 + paddw xmm0,xmm2 ; xmm0=z4 + + movdqa xmm3,xmm6 + psubw xmm6,xmm5 ; xmm6=z13 + paddw xmm3,xmm5 ; xmm3=z11 + + movdqa xmm2,xmm6 + movdqa xmm5,xmm3 + psubw xmm6,xmm4 ; xmm6=data3 + psubw xmm3,xmm0 ; xmm3=data7 + paddw xmm2,xmm4 ; xmm2=data5 + paddw xmm5,xmm0 ; xmm5=data1 + + ; ---- Pass 2: process columns. + +; mov edx, POINTER [data(eax)] ; (DCTELEM *) + + ; xmm1=(00 10 20 30 40 50 60 70), xmm7=(02 12 22 32 42 52 62 72) + ; xmm5=(01 11 21 31 41 51 61 71), xmm6=(03 13 23 33 43 53 63 73) + + movdqa xmm4,xmm1 ; transpose coefficients(phase 1) + punpcklwd xmm1,xmm5 ; xmm1=(00 01 10 11 20 21 30 31) + punpckhwd xmm4,xmm5 ; xmm4=(40 41 50 51 60 61 70 71) + movdqa xmm0,xmm7 ; transpose coefficients(phase 1) + punpcklwd xmm7,xmm6 ; xmm7=(02 03 12 13 22 23 32 33) + punpckhwd xmm0,xmm6 ; xmm0=(42 43 52 53 62 63 72 73) + + movdqa xmm5, XMMWORD [wk(0)] ; xmm5=col4 + movdqa xmm6, XMMWORD [wk(1)] ; xmm6=col6 + + ; xmm5=(04 14 24 34 44 54 64 74), xmm6=(06 16 26 36 46 56 66 76) + ; xmm2=(05 15 25 35 45 55 65 75), xmm3=(07 17 27 37 47 57 67 77) + + movdqa XMMWORD [wk(0)], xmm7 ; wk(0)=(02 03 12 13 22 23 32 33) + movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=(42 43 52 53 62 63 72 73) + + movdqa xmm7,xmm5 ; transpose coefficients(phase 1) + punpcklwd xmm5,xmm2 ; xmm5=(04 05 14 15 24 25 34 35) + punpckhwd xmm7,xmm2 ; xmm7=(44 45 54 55 64 65 74 75) + movdqa xmm0,xmm6 ; transpose coefficients(phase 1) + punpcklwd xmm6,xmm3 ; xmm6=(06 07 16 17 26 27 36 37) + punpckhwd xmm0,xmm3 ; xmm0=(46 47 56 57 66 67 76 77) + + movdqa xmm2,xmm5 ; transpose coefficients(phase 2) + punpckldq xmm5,xmm6 ; xmm5=(04 05 06 07 14 15 16 17) + punpckhdq xmm2,xmm6 ; xmm2=(24 25 26 27 34 35 36 37) + movdqa xmm3,xmm7 ; transpose coefficients(phase 2) + punpckldq xmm7,xmm0 ; xmm7=(44 45 46 47 54 55 56 57) + punpckhdq xmm3,xmm0 ; xmm3=(64 65 66 67 74 75 76 77) + + movdqa xmm6, XMMWORD [wk(0)] ; xmm6=(02 03 12 13 22 23 32 33) + movdqa xmm0, XMMWORD [wk(1)] ; xmm0=(42 43 52 53 62 63 72 73) + movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=(24 25 26 27 34 35 36 37) + movdqa XMMWORD [wk(1)], xmm7 ; wk(1)=(44 45 46 47 54 55 56 57) + + movdqa xmm2,xmm1 ; transpose coefficients(phase 2) + punpckldq xmm1,xmm6 ; xmm1=(00 01 02 03 10 11 12 13) + punpckhdq xmm2,xmm6 ; xmm2=(20 21 22 23 30 31 32 33) + movdqa xmm7,xmm4 ; transpose coefficients(phase 2) + punpckldq xmm4,xmm0 ; xmm4=(40 41 42 43 50 51 52 53) + punpckhdq xmm7,xmm0 ; xmm7=(60 61 62 63 70 71 72 73) + + movdqa xmm6,xmm1 ; transpose coefficients(phase 3) + punpcklqdq xmm1,xmm5 ; xmm1=(00 01 02 03 04 05 06 07)=data0 + punpckhqdq xmm6,xmm5 ; xmm6=(10 11 12 13 14 15 16 17)=data1 + movdqa xmm0,xmm7 ; transpose coefficients(phase 3) + punpcklqdq xmm7,xmm3 ; xmm7=(60 61 62 63 64 65 66 67)=data6 + punpckhqdq xmm0,xmm3 ; xmm0=(70 71 72 73 74 75 76 77)=data7 + + movdqa xmm5,xmm6 + movdqa xmm3,xmm1 + psubw xmm6,xmm7 ; xmm6=data1-data6=tmp6 + psubw xmm1,xmm0 ; xmm1=data0-data7=tmp7 + paddw xmm5,xmm7 ; xmm5=data1+data6=tmp1 + paddw xmm3,xmm0 ; xmm3=data0+data7=tmp0 + + movdqa xmm7, XMMWORD [wk(0)] ; xmm7=(24 25 26 27 34 35 36 37) + movdqa xmm0, XMMWORD [wk(1)] ; xmm0=(44 45 46 47 54 55 56 57) + movdqa XMMWORD [wk(0)], xmm6 ; wk(0)=tmp6 + movdqa XMMWORD [wk(1)], xmm1 ; wk(1)=tmp7 + + movdqa xmm6,xmm2 ; transpose coefficients(phase 3) + punpcklqdq xmm2,xmm7 ; xmm2=(20 21 22 23 24 25 26 27)=data2 + punpckhqdq xmm6,xmm7 ; xmm6=(30 31 32 33 34 35 36 37)=data3 + movdqa xmm1,xmm4 ; transpose coefficients(phase 3) + punpcklqdq xmm4,xmm0 ; xmm4=(40 41 42 43 44 45 46 47)=data4 + punpckhqdq xmm1,xmm0 ; xmm1=(50 51 52 53 54 55 56 57)=data5 + + movdqa xmm7,xmm6 + movdqa xmm0,xmm2 + paddw xmm6,xmm4 ; xmm6=data3+data4=tmp3 + paddw xmm2,xmm1 ; xmm2=data2+data5=tmp2 + psubw xmm7,xmm4 ; xmm7=data3-data4=tmp4 + psubw xmm0,xmm1 ; xmm0=data2-data5=tmp5 + + ; -- Even part + + movdqa xmm4,xmm3 + movdqa xmm1,xmm5 + psubw xmm3,xmm6 ; xmm3=tmp13 + psubw xmm5,xmm2 ; xmm5=tmp12 + paddw xmm4,xmm6 ; xmm4=tmp10 + paddw xmm1,xmm2 ; xmm1=tmp11 + + paddw xmm5,xmm3 + psllw xmm5,PRE_MULTIPLY_SCALE_BITS + pmulhw xmm5,[GOTOFF(ebx,PW_F0707)] ; xmm5=z1 + + movdqa xmm6,xmm4 + movdqa xmm2,xmm3 + psubw xmm4,xmm1 ; xmm4=data4 + psubw xmm3,xmm5 ; xmm3=data6 + paddw xmm6,xmm1 ; xmm6=data0 + paddw xmm2,xmm5 ; xmm2=data2 + + movdqa XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_DCTELEM)], xmm4 + movdqa XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_DCTELEM)], xmm3 + movdqa XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_DCTELEM)], xmm6 + movdqa XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_DCTELEM)], xmm2 + + ; -- Odd part + + movdqa xmm1, XMMWORD [wk(0)] ; xmm1=tmp6 + movdqa xmm5, XMMWORD [wk(1)] ; xmm5=tmp7 + + paddw xmm7,xmm0 ; xmm7=tmp10 + paddw xmm0,xmm1 ; xmm0=tmp11 + paddw xmm1,xmm5 ; xmm1=tmp12, xmm5=tmp7 + + psllw xmm7,PRE_MULTIPLY_SCALE_BITS + psllw xmm1,PRE_MULTIPLY_SCALE_BITS + + psllw xmm0,PRE_MULTIPLY_SCALE_BITS + pmulhw xmm0,[GOTOFF(ebx,PW_F0707)] ; xmm0=z3 + + movdqa xmm4,xmm7 ; xmm4=tmp10 + psubw xmm7,xmm1 + pmulhw xmm7,[GOTOFF(ebx,PW_F0382)] ; xmm7=z5 + pmulhw xmm4,[GOTOFF(ebx,PW_F0541)] ; xmm4=MULTIPLY(tmp10,FIX_0_541196) + pmulhw xmm1,[GOTOFF(ebx,PW_F1306)] ; xmm1=MULTIPLY(tmp12,FIX_1_306562) + paddw xmm4,xmm7 ; xmm4=z2 + paddw xmm1,xmm7 ; xmm1=z4 + + movdqa xmm3,xmm5 + psubw xmm5,xmm0 ; xmm5=z13 + paddw xmm3,xmm0 ; xmm3=z11 + + movdqa xmm6,xmm5 + movdqa xmm2,xmm3 + psubw xmm5,xmm4 ; xmm5=data3 + psubw xmm3,xmm1 ; xmm3=data7 + paddw xmm6,xmm4 ; xmm6=data5 + paddw xmm2,xmm1 ; xmm2=data1 + + movdqa XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_DCTELEM)], xmm5 + movdqa XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_DCTELEM)], xmm3 + movdqa XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_DCTELEM)], xmm6 + movdqa XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_DCTELEM)], xmm2 + +; pop edi ; unused +; pop esi ; unused +; pop edx ; need not be preserved +; pop ecx ; unused + poppic ebx + mov esp,ebp ; esp <- aligned ebp + pop esp ; esp <- original ebp + pop ebp + ret + +; For some reason, the OS X linker does not honor the request to align the +; segment unless we do this. + align 16 diff --git a/media/libjpeg/simd/jfdctint-altivec.c b/media/libjpeg/simd/jfdctint-altivec.c new file mode 100644 index 000000000..e6e8a5687 --- /dev/null +++ b/media/libjpeg/simd/jfdctint-altivec.c @@ -0,0 +1,262 @@ +/* + * AltiVec optimizations for libjpeg-turbo + * + * Copyright (C) 2014, D. R. Commander. All Rights Reserved. + * + * This software is provided 'as-is', without any express or implied + * warranty. In no event will the authors be held liable for any damages + * arising from the use of this software. + * + * Permission is granted to anyone to use this software for any purpose, + * including commercial applications, and to alter it and redistribute it + * freely, subject to the following restrictions: + * + * 1. The origin of this software must not be misrepresented; you must not + * claim that you wrote the original software. If you use this software + * in a product, an acknowledgment in the product documentation would be + * appreciated but is not required. + * 2. Altered source versions must be plainly marked as such, and must not be + * misrepresented as being the original software. + * 3. This notice may not be removed or altered from any source distribution. + */ + +/* SLOW INTEGER FORWARD DCT */ + +#include "jsimd_altivec.h" + + +#define F_0_298 2446 /* FIX(0.298631336) */ +#define F_0_390 3196 /* FIX(0.390180644) */ +#define F_0_541 4433 /* FIX(0.541196100) */ +#define F_0_765 6270 /* FIX(0.765366865) */ +#define F_0_899 7373 /* FIX(0.899976223) */ +#define F_1_175 9633 /* FIX(1.175875602) */ +#define F_1_501 12299 /* FIX(1.501321110) */ +#define F_1_847 15137 /* FIX(1.847759065) */ +#define F_1_961 16069 /* FIX(1.961570560) */ +#define F_2_053 16819 /* FIX(2.053119869) */ +#define F_2_562 20995 /* FIX(2.562915447) */ +#define F_3_072 25172 /* FIX(3.072711026) */ + +#define CONST_BITS 13 +#define PASS1_BITS 2 +#define DESCALE_P1 (CONST_BITS - PASS1_BITS) +#define DESCALE_P2 (CONST_BITS + PASS1_BITS) + + +#define DO_FDCT_COMMON(PASS) \ +{ \ + /* (Original) \ + * z1 = (tmp12 + tmp13) * 0.541196100; \ + * data2 = z1 + tmp13 * 0.765366865; \ + * data6 = z1 + tmp12 * -1.847759065; \ + * \ + * (This implementation) \ + * data2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100; \ + * data6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065); \ + */ \ + \ + tmp1312l = vec_mergeh(tmp13, tmp12); \ + tmp1312h = vec_mergel(tmp13, tmp12); \ + \ + out2l = vec_msums(tmp1312l, pw_f130_f054, pd_descale_p##PASS); \ + out2h = vec_msums(tmp1312h, pw_f130_f054, pd_descale_p##PASS); \ + out6l = vec_msums(tmp1312l, pw_f054_mf130, pd_descale_p##PASS); \ + out6h = vec_msums(tmp1312h, pw_f054_mf130, pd_descale_p##PASS); \ + \ + out2l = vec_sra(out2l, descale_p##PASS); \ + out2h = vec_sra(out2h, descale_p##PASS); \ + out6l = vec_sra(out6l, descale_p##PASS); \ + out6h = vec_sra(out6h, descale_p##PASS); \ + \ + out2 = vec_pack(out2l, out2h); \ + out6 = vec_pack(out6l, out6h); \ + \ + /* Odd part */ \ + \ + z3 = vec_add(tmp4, tmp6); \ + z4 = vec_add(tmp5, tmp7); \ + \ + /* (Original) \ + * z5 = (z3 + z4) * 1.175875602; \ + * z3 = z3 * -1.961570560; z4 = z4 * -0.390180644; \ + * z3 += z5; z4 += z5; \ + * \ + * (This implementation) \ + * z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602; \ + * z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644); \ + */ \ + \ + z34l = vec_mergeh(z3, z4); \ + z34h = vec_mergel(z3, z4); \ + \ + z3l = vec_msums(z34l, pw_mf078_f117, pd_descale_p##PASS); \ + z3h = vec_msums(z34h, pw_mf078_f117, pd_descale_p##PASS); \ + z4l = vec_msums(z34l, pw_f117_f078, pd_descale_p##PASS); \ + z4h = vec_msums(z34h, pw_f117_f078, pd_descale_p##PASS); \ + \ + /* (Original) \ + * z1 = tmp4 + tmp7; z2 = tmp5 + tmp6; \ + * tmp4 = tmp4 * 0.298631336; tmp5 = tmp5 * 2.053119869; \ + * tmp6 = tmp6 * 3.072711026; tmp7 = tmp7 * 1.501321110; \ + * z1 = z1 * -0.899976223; z2 = z2 * -2.562915447; \ + * data7 = tmp4 + z1 + z3; data5 = tmp5 + z2 + z4; \ + * data3 = tmp6 + z2 + z3; data1 = tmp7 + z1 + z4; \ + * \ + * (This implementation) \ + * tmp4 = tmp4 * (0.298631336 - 0.899976223) + tmp7 * -0.899976223; \ + * tmp5 = tmp5 * (2.053119869 - 2.562915447) + tmp6 * -2.562915447; \ + * tmp6 = tmp5 * -2.562915447 + tmp6 * (3.072711026 - 2.562915447); \ + * tmp7 = tmp4 * -0.899976223 + tmp7 * (1.501321110 - 0.899976223); \ + * data7 = tmp4 + z3; data5 = tmp5 + z4; \ + * data3 = tmp6 + z3; data1 = tmp7 + z4; \ + */ \ + \ + tmp47l = vec_mergeh(tmp4, tmp7); \ + tmp47h = vec_mergel(tmp4, tmp7); \ + \ + out7l = vec_msums(tmp47l, pw_mf060_mf089, z3l); \ + out7h = vec_msums(tmp47h, pw_mf060_mf089, z3h); \ + out1l = vec_msums(tmp47l, pw_mf089_f060, z4l); \ + out1h = vec_msums(tmp47h, pw_mf089_f060, z4h); \ + \ + out7l = vec_sra(out7l, descale_p##PASS); \ + out7h = vec_sra(out7h, descale_p##PASS); \ + out1l = vec_sra(out1l, descale_p##PASS); \ + out1h = vec_sra(out1h, descale_p##PASS); \ + \ + out7 = vec_pack(out7l, out7h); \ + out1 = vec_pack(out1l, out1h); \ + \ + tmp56l = vec_mergeh(tmp5, tmp6); \ + tmp56h = vec_mergel(tmp5, tmp6); \ + \ + out5l = vec_msums(tmp56l, pw_mf050_mf256, z4l); \ + out5h = vec_msums(tmp56h, pw_mf050_mf256, z4h); \ + out3l = vec_msums(tmp56l, pw_mf256_f050, z3l); \ + out3h = vec_msums(tmp56h, pw_mf256_f050, z3h); \ + \ + out5l = vec_sra(out5l, descale_p##PASS); \ + out5h = vec_sra(out5h, descale_p##PASS); \ + out3l = vec_sra(out3l, descale_p##PASS); \ + out3h = vec_sra(out3h, descale_p##PASS); \ + \ + out5 = vec_pack(out5l, out5h); \ + out3 = vec_pack(out3l, out3h); \ +} + +#define DO_FDCT_PASS1() \ +{ \ + /* Even part */ \ + \ + tmp10 = vec_add(tmp0, tmp3); \ + tmp13 = vec_sub(tmp0, tmp3); \ + tmp11 = vec_add(tmp1, tmp2); \ + tmp12 = vec_sub(tmp1, tmp2); \ + \ + out0 = vec_add(tmp10, tmp11); \ + out0 = vec_sl(out0, pass1_bits); \ + out4 = vec_sub(tmp10, tmp11); \ + out4 = vec_sl(out4, pass1_bits); \ + \ + DO_FDCT_COMMON(1); \ +} + +#define DO_FDCT_PASS2() \ +{ \ + /* Even part */ \ + \ + tmp10 = vec_add(tmp0, tmp3); \ + tmp13 = vec_sub(tmp0, tmp3); \ + tmp11 = vec_add(tmp1, tmp2); \ + tmp12 = vec_sub(tmp1, tmp2); \ + \ + out0 = vec_add(tmp10, tmp11); \ + out0 = vec_add(out0, pw_descale_p2x); \ + out0 = vec_sra(out0, pass1_bits); \ + out4 = vec_sub(tmp10, tmp11); \ + out4 = vec_add(out4, pw_descale_p2x); \ + out4 = vec_sra(out4, pass1_bits); \ + \ + DO_FDCT_COMMON(2); \ +} + + +void +jsimd_fdct_islow_altivec (DCTELEM *data) +{ + __vector short row0, row1, row2, row3, row4, row5, row6, row7, + col0, col1, col2, col3, col4, col5, col6, col7, + tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp10, tmp11, tmp12, tmp13, + tmp47l, tmp47h, tmp56l, tmp56h, tmp1312l, tmp1312h, + z3, z4, z34l, z34h, + out0, out1, out2, out3, out4, out5, out6, out7; + __vector int z3l, z3h, z4l, z4h, + out1l, out1h, out2l, out2h, out3l, out3h, out5l, out5h, out6l, out6h, + out7l, out7h; + + /* Constants */ + __vector short + pw_f130_f054 = { __4X2(F_0_541 + F_0_765, F_0_541) }, + pw_f054_mf130 = { __4X2(F_0_541, F_0_541 - F_1_847) }, + pw_mf078_f117 = { __4X2(F_1_175 - F_1_961, F_1_175) }, + pw_f117_f078 = { __4X2(F_1_175, F_1_175 - F_0_390) }, + pw_mf060_mf089 = { __4X2(F_0_298 - F_0_899, -F_0_899) }, + pw_mf089_f060 = { __4X2(-F_0_899, F_1_501 - F_0_899) }, + pw_mf050_mf256 = { __4X2(F_2_053 - F_2_562, -F_2_562) }, + pw_mf256_f050 = { __4X2(-F_2_562, F_3_072 - F_2_562) }, + pw_descale_p2x = { __8X(1 << (PASS1_BITS - 1)) }; + __vector unsigned short pass1_bits = { __8X(PASS1_BITS) }; + __vector int pd_descale_p1 = { __4X(1 << (DESCALE_P1 - 1)) }, + pd_descale_p2 = { __4X(1 << (DESCALE_P2 - 1)) }; + __vector unsigned int descale_p1 = { __4X(DESCALE_P1) }, + descale_p2 = { __4X(DESCALE_P2) }; + + /* Pass 1: process rows */ + + row0 = vec_ld(0, data); + row1 = vec_ld(16, data); + row2 = vec_ld(32, data); + row3 = vec_ld(48, data); + row4 = vec_ld(64, data); + row5 = vec_ld(80, data); + row6 = vec_ld(96, data); + row7 = vec_ld(112, data); + + TRANSPOSE(row, col); + + tmp0 = vec_add(col0, col7); + tmp7 = vec_sub(col0, col7); + tmp1 = vec_add(col1, col6); + tmp6 = vec_sub(col1, col6); + tmp2 = vec_add(col2, col5); + tmp5 = vec_sub(col2, col5); + tmp3 = vec_add(col3, col4); + tmp4 = vec_sub(col3, col4); + + DO_FDCT_PASS1(); + + /* Pass 2: process columns */ + + TRANSPOSE(out, row); + + tmp0 = vec_add(row0, row7); + tmp7 = vec_sub(row0, row7); + tmp1 = vec_add(row1, row6); + tmp6 = vec_sub(row1, row6); + tmp2 = vec_add(row2, row5); + tmp5 = vec_sub(row2, row5); + tmp3 = vec_add(row3, row4); + tmp4 = vec_sub(row3, row4); + + DO_FDCT_PASS2(); + + vec_st(out0, 0, data); + vec_st(out1, 16, data); + vec_st(out2, 32, data); + vec_st(out3, 48, data); + vec_st(out4, 64, data); + vec_st(out5, 80, data); + vec_st(out6, 96, data); + vec_st(out7, 112, data); +} diff --git a/media/libjpeg/simd/jfdctint-mmx.asm b/media/libjpeg/simd/jfdctint-mmx.asm new file mode 100644 index 000000000..9142ad881 --- /dev/null +++ b/media/libjpeg/simd/jfdctint-mmx.asm @@ -0,0 +1,621 @@ +; +; jfdctint.asm - accurate integer FDCT (MMX) +; +; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB +; +; Based on the x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler). +; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 +; +; This file contains a slow-but-accurate integer implementation of the +; forward DCT (Discrete Cosine Transform). The following code is based +; directly on the IJG's original jfdctint.c; see the jfdctint.c for +; more details. +; +; [TAB8] + +%include "jsimdext.inc" +%include "jdct.inc" + +; -------------------------------------------------------------------------- + +%define CONST_BITS 13 +%define PASS1_BITS 2 + +%define DESCALE_P1 (CONST_BITS-PASS1_BITS) +%define DESCALE_P2 (CONST_BITS+PASS1_BITS) + +%if CONST_BITS == 13 +F_0_298 equ 2446 ; FIX(0.298631336) +F_0_390 equ 3196 ; FIX(0.390180644) +F_0_541 equ 4433 ; FIX(0.541196100) +F_0_765 equ 6270 ; FIX(0.765366865) +F_0_899 equ 7373 ; FIX(0.899976223) +F_1_175 equ 9633 ; FIX(1.175875602) +F_1_501 equ 12299 ; FIX(1.501321110) +F_1_847 equ 15137 ; FIX(1.847759065) +F_1_961 equ 16069 ; FIX(1.961570560) +F_2_053 equ 16819 ; FIX(2.053119869) +F_2_562 equ 20995 ; FIX(2.562915447) +F_3_072 equ 25172 ; FIX(3.072711026) +%else +; NASM cannot do compile-time arithmetic on floating-point constants. +%define DESCALE(x,n) (((x)+(1<<((n)-1)))>>(n)) +F_0_298 equ DESCALE( 320652955,30-CONST_BITS) ; FIX(0.298631336) +F_0_390 equ DESCALE( 418953276,30-CONST_BITS) ; FIX(0.390180644) +F_0_541 equ DESCALE( 581104887,30-CONST_BITS) ; FIX(0.541196100) +F_0_765 equ DESCALE( 821806413,30-CONST_BITS) ; FIX(0.765366865) +F_0_899 equ DESCALE( 966342111,30-CONST_BITS) ; FIX(0.899976223) +F_1_175 equ DESCALE(1262586813,30-CONST_BITS) ; FIX(1.175875602) +F_1_501 equ DESCALE(1612031267,30-CONST_BITS) ; FIX(1.501321110) +F_1_847 equ DESCALE(1984016188,30-CONST_BITS) ; FIX(1.847759065) +F_1_961 equ DESCALE(2106220350,30-CONST_BITS) ; FIX(1.961570560) +F_2_053 equ DESCALE(2204520673,30-CONST_BITS) ; FIX(2.053119869) +F_2_562 equ DESCALE(2751909506,30-CONST_BITS) ; FIX(2.562915447) +F_3_072 equ DESCALE(3299298341,30-CONST_BITS) ; FIX(3.072711026) +%endif + +; -------------------------------------------------------------------------- + SECTION SEG_CONST + + alignz 16 + global EXTN(jconst_fdct_islow_mmx) + +EXTN(jconst_fdct_islow_mmx): + +PW_F130_F054 times 2 dw (F_0_541+F_0_765), F_0_541 +PW_F054_MF130 times 2 dw F_0_541, (F_0_541-F_1_847) +PW_MF078_F117 times 2 dw (F_1_175-F_1_961), F_1_175 +PW_F117_F078 times 2 dw F_1_175, (F_1_175-F_0_390) +PW_MF060_MF089 times 2 dw (F_0_298-F_0_899),-F_0_899 +PW_MF089_F060 times 2 dw -F_0_899, (F_1_501-F_0_899) +PW_MF050_MF256 times 2 dw (F_2_053-F_2_562),-F_2_562 +PW_MF256_F050 times 2 dw -F_2_562, (F_3_072-F_2_562) +PD_DESCALE_P1 times 2 dd 1 << (DESCALE_P1-1) +PD_DESCALE_P2 times 2 dd 1 << (DESCALE_P2-1) +PW_DESCALE_P2X times 4 dw 1 << (PASS1_BITS-1) + + alignz 16 + +; -------------------------------------------------------------------------- + SECTION SEG_TEXT + BITS 32 +; +; Perform the forward DCT on one block of samples. +; +; GLOBAL(void) +; jsimd_fdct_islow_mmx (DCTELEM *data) +; + +%define data(b) (b)+8 ; DCTELEM *data + +%define original_ebp ebp+0 +%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_MMWORD ; mmword wk[WK_NUM] +%define WK_NUM 2 + + align 16 + global EXTN(jsimd_fdct_islow_mmx) + +EXTN(jsimd_fdct_islow_mmx): + push ebp + mov eax,esp ; eax = original ebp + sub esp, byte 4 + and esp, byte (-SIZEOF_MMWORD) ; align to 64 bits + mov [esp],eax + mov ebp,esp ; ebp = aligned ebp + lea esp, [wk(0)] + pushpic ebx +; push ecx ; need not be preserved +; push edx ; need not be preserved +; push esi ; unused +; push edi ; unused + + get_GOT ebx ; get GOT address + + ; ---- Pass 1: process rows. + + mov edx, POINTER [data(eax)] ; (DCTELEM *) + mov ecx, DCTSIZE/4 + alignx 16,7 +.rowloop: + + movq mm0, MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)] + movq mm1, MMWORD [MMBLOCK(3,0,edx,SIZEOF_DCTELEM)] + movq mm2, MMWORD [MMBLOCK(2,1,edx,SIZEOF_DCTELEM)] + movq mm3, MMWORD [MMBLOCK(3,1,edx,SIZEOF_DCTELEM)] + + ; mm0=(20 21 22 23), mm2=(24 25 26 27) + ; mm1=(30 31 32 33), mm3=(34 35 36 37) + + movq mm4,mm0 ; transpose coefficients(phase 1) + punpcklwd mm0,mm1 ; mm0=(20 30 21 31) + punpckhwd mm4,mm1 ; mm4=(22 32 23 33) + movq mm5,mm2 ; transpose coefficients(phase 1) + punpcklwd mm2,mm3 ; mm2=(24 34 25 35) + punpckhwd mm5,mm3 ; mm5=(26 36 27 37) + + movq mm6, MMWORD [MMBLOCK(0,0,edx,SIZEOF_DCTELEM)] + movq mm7, MMWORD [MMBLOCK(1,0,edx,SIZEOF_DCTELEM)] + movq mm1, MMWORD [MMBLOCK(0,1,edx,SIZEOF_DCTELEM)] + movq mm3, MMWORD [MMBLOCK(1,1,edx,SIZEOF_DCTELEM)] + + ; mm6=(00 01 02 03), mm1=(04 05 06 07) + ; mm7=(10 11 12 13), mm3=(14 15 16 17) + + movq MMWORD [wk(0)], mm4 ; wk(0)=(22 32 23 33) + movq MMWORD [wk(1)], mm2 ; wk(1)=(24 34 25 35) + + movq mm4,mm6 ; transpose coefficients(phase 1) + punpcklwd mm6,mm7 ; mm6=(00 10 01 11) + punpckhwd mm4,mm7 ; mm4=(02 12 03 13) + movq mm2,mm1 ; transpose coefficients(phase 1) + punpcklwd mm1,mm3 ; mm1=(04 14 05 15) + punpckhwd mm2,mm3 ; mm2=(06 16 07 17) + + movq mm7,mm6 ; transpose coefficients(phase 2) + punpckldq mm6,mm0 ; mm6=(00 10 20 30)=data0 + punpckhdq mm7,mm0 ; mm7=(01 11 21 31)=data1 + movq mm3,mm2 ; transpose coefficients(phase 2) + punpckldq mm2,mm5 ; mm2=(06 16 26 36)=data6 + punpckhdq mm3,mm5 ; mm3=(07 17 27 37)=data7 + + movq mm0,mm7 + movq mm5,mm6 + psubw mm7,mm2 ; mm7=data1-data6=tmp6 + psubw mm6,mm3 ; mm6=data0-data7=tmp7 + paddw mm0,mm2 ; mm0=data1+data6=tmp1 + paddw mm5,mm3 ; mm5=data0+data7=tmp0 + + movq mm2, MMWORD [wk(0)] ; mm2=(22 32 23 33) + movq mm3, MMWORD [wk(1)] ; mm3=(24 34 25 35) + movq MMWORD [wk(0)], mm7 ; wk(0)=tmp6 + movq MMWORD [wk(1)], mm6 ; wk(1)=tmp7 + + movq mm7,mm4 ; transpose coefficients(phase 2) + punpckldq mm4,mm2 ; mm4=(02 12 22 32)=data2 + punpckhdq mm7,mm2 ; mm7=(03 13 23 33)=data3 + movq mm6,mm1 ; transpose coefficients(phase 2) + punpckldq mm1,mm3 ; mm1=(04 14 24 34)=data4 + punpckhdq mm6,mm3 ; mm6=(05 15 25 35)=data5 + + movq mm2,mm7 + movq mm3,mm4 + paddw mm7,mm1 ; mm7=data3+data4=tmp3 + paddw mm4,mm6 ; mm4=data2+data5=tmp2 + psubw mm2,mm1 ; mm2=data3-data4=tmp4 + psubw mm3,mm6 ; mm3=data2-data5=tmp5 + + ; -- Even part + + movq mm1,mm5 + movq mm6,mm0 + paddw mm5,mm7 ; mm5=tmp10 + paddw mm0,mm4 ; mm0=tmp11 + psubw mm1,mm7 ; mm1=tmp13 + psubw mm6,mm4 ; mm6=tmp12 + + movq mm7,mm5 + paddw mm5,mm0 ; mm5=tmp10+tmp11 + psubw mm7,mm0 ; mm7=tmp10-tmp11 + + psllw mm5,PASS1_BITS ; mm5=data0 + psllw mm7,PASS1_BITS ; mm7=data4 + + movq MMWORD [MMBLOCK(0,0,edx,SIZEOF_DCTELEM)], mm5 + movq MMWORD [MMBLOCK(0,1,edx,SIZEOF_DCTELEM)], mm7 + + ; (Original) + ; z1 = (tmp12 + tmp13) * 0.541196100; + ; data2 = z1 + tmp13 * 0.765366865; + ; data6 = z1 + tmp12 * -1.847759065; + ; + ; (This implementation) + ; data2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100; + ; data6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065); + + movq mm4,mm1 ; mm1=tmp13 + movq mm0,mm1 + punpcklwd mm4,mm6 ; mm6=tmp12 + punpckhwd mm0,mm6 + movq mm1,mm4 + movq mm6,mm0 + pmaddwd mm4,[GOTOFF(ebx,PW_F130_F054)] ; mm4=data2L + pmaddwd mm0,[GOTOFF(ebx,PW_F130_F054)] ; mm0=data2H + pmaddwd mm1,[GOTOFF(ebx,PW_F054_MF130)] ; mm1=data6L + pmaddwd mm6,[GOTOFF(ebx,PW_F054_MF130)] ; mm6=data6H + + paddd mm4,[GOTOFF(ebx,PD_DESCALE_P1)] + paddd mm0,[GOTOFF(ebx,PD_DESCALE_P1)] + psrad mm4,DESCALE_P1 + psrad mm0,DESCALE_P1 + paddd mm1,[GOTOFF(ebx,PD_DESCALE_P1)] + paddd mm6,[GOTOFF(ebx,PD_DESCALE_P1)] + psrad mm1,DESCALE_P1 + psrad mm6,DESCALE_P1 + + packssdw mm4,mm0 ; mm4=data2 + packssdw mm1,mm6 ; mm1=data6 + + movq MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)], mm4 + movq MMWORD [MMBLOCK(2,1,edx,SIZEOF_DCTELEM)], mm1 + + ; -- Odd part + + movq mm5, MMWORD [wk(0)] ; mm5=tmp6 + movq mm7, MMWORD [wk(1)] ; mm7=tmp7 + + movq mm0,mm2 ; mm2=tmp4 + movq mm6,mm3 ; mm3=tmp5 + paddw mm0,mm5 ; mm0=z3 + paddw mm6,mm7 ; mm6=z4 + + ; (Original) + ; z5 = (z3 + z4) * 1.175875602; + ; z3 = z3 * -1.961570560; z4 = z4 * -0.390180644; + ; z3 += z5; z4 += z5; + ; + ; (This implementation) + ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602; + ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644); + + movq mm4,mm0 + movq mm1,mm0 + punpcklwd mm4,mm6 + punpckhwd mm1,mm6 + movq mm0,mm4 + movq mm6,mm1 + pmaddwd mm4,[GOTOFF(ebx,PW_MF078_F117)] ; mm4=z3L + pmaddwd mm1,[GOTOFF(ebx,PW_MF078_F117)] ; mm1=z3H + pmaddwd mm0,[GOTOFF(ebx,PW_F117_F078)] ; mm0=z4L + pmaddwd mm6,[GOTOFF(ebx,PW_F117_F078)] ; mm6=z4H + + movq MMWORD [wk(0)], mm4 ; wk(0)=z3L + movq MMWORD [wk(1)], mm1 ; wk(1)=z3H + + ; (Original) + ; z1 = tmp4 + tmp7; z2 = tmp5 + tmp6; + ; tmp4 = tmp4 * 0.298631336; tmp5 = tmp5 * 2.053119869; + ; tmp6 = tmp6 * 3.072711026; tmp7 = tmp7 * 1.501321110; + ; z1 = z1 * -0.899976223; z2 = z2 * -2.562915447; + ; data7 = tmp4 + z1 + z3; data5 = tmp5 + z2 + z4; + ; data3 = tmp6 + z2 + z3; data1 = tmp7 + z1 + z4; + ; + ; (This implementation) + ; tmp4 = tmp4 * (0.298631336 - 0.899976223) + tmp7 * -0.899976223; + ; tmp5 = tmp5 * (2.053119869 - 2.562915447) + tmp6 * -2.562915447; + ; tmp6 = tmp5 * -2.562915447 + tmp6 * (3.072711026 - 2.562915447); + ; tmp7 = tmp4 * -0.899976223 + tmp7 * (1.501321110 - 0.899976223); + ; data7 = tmp4 + z3; data5 = tmp5 + z4; + ; data3 = tmp6 + z3; data1 = tmp7 + z4; + + movq mm4,mm2 + movq mm1,mm2 + punpcklwd mm4,mm7 + punpckhwd mm1,mm7 + movq mm2,mm4 + movq mm7,mm1 + pmaddwd mm4,[GOTOFF(ebx,PW_MF060_MF089)] ; mm4=tmp4L + pmaddwd mm1,[GOTOFF(ebx,PW_MF060_MF089)] ; mm1=tmp4H + pmaddwd mm2,[GOTOFF(ebx,PW_MF089_F060)] ; mm2=tmp7L + pmaddwd mm7,[GOTOFF(ebx,PW_MF089_F060)] ; mm7=tmp7H + + paddd mm4, MMWORD [wk(0)] ; mm4=data7L + paddd mm1, MMWORD [wk(1)] ; mm1=data7H + paddd mm2,mm0 ; mm2=data1L + paddd mm7,mm6 ; mm7=data1H + + paddd mm4,[GOTOFF(ebx,PD_DESCALE_P1)] + paddd mm1,[GOTOFF(ebx,PD_DESCALE_P1)] + psrad mm4,DESCALE_P1 + psrad mm1,DESCALE_P1 + paddd mm2,[GOTOFF(ebx,PD_DESCALE_P1)] + paddd mm7,[GOTOFF(ebx,PD_DESCALE_P1)] + psrad mm2,DESCALE_P1 + psrad mm7,DESCALE_P1 + + packssdw mm4,mm1 ; mm4=data7 + packssdw mm2,mm7 ; mm2=data1 + + movq MMWORD [MMBLOCK(3,1,edx,SIZEOF_DCTELEM)], mm4 + movq MMWORD [MMBLOCK(1,0,edx,SIZEOF_DCTELEM)], mm2 + + movq mm1,mm3 + movq mm7,mm3 + punpcklwd mm1,mm5 + punpckhwd mm7,mm5 + movq mm3,mm1 + movq mm5,mm7 + pmaddwd mm1,[GOTOFF(ebx,PW_MF050_MF256)] ; mm1=tmp5L + pmaddwd mm7,[GOTOFF(ebx,PW_MF050_MF256)] ; mm7=tmp5H + pmaddwd mm3,[GOTOFF(ebx,PW_MF256_F050)] ; mm3=tmp6L + pmaddwd mm5,[GOTOFF(ebx,PW_MF256_F050)] ; mm5=tmp6H + + paddd mm1,mm0 ; mm1=data5L + paddd mm7,mm6 ; mm7=data5H + paddd mm3, MMWORD [wk(0)] ; mm3=data3L + paddd mm5, MMWORD [wk(1)] ; mm5=data3H + + paddd mm1,[GOTOFF(ebx,PD_DESCALE_P1)] + paddd mm7,[GOTOFF(ebx,PD_DESCALE_P1)] + psrad mm1,DESCALE_P1 + psrad mm7,DESCALE_P1 + paddd mm3,[GOTOFF(ebx,PD_DESCALE_P1)] + paddd mm5,[GOTOFF(ebx,PD_DESCALE_P1)] + psrad mm3,DESCALE_P1 + psrad mm5,DESCALE_P1 + + packssdw mm1,mm7 ; mm1=data5 + packssdw mm3,mm5 ; mm3=data3 + + movq MMWORD [MMBLOCK(1,1,edx,SIZEOF_DCTELEM)], mm1 + movq MMWORD [MMBLOCK(3,0,edx,SIZEOF_DCTELEM)], mm3 + + add edx, byte 4*DCTSIZE*SIZEOF_DCTELEM + dec ecx + jnz near .rowloop + + ; ---- Pass 2: process columns. + + mov edx, POINTER [data(eax)] ; (DCTELEM *) + mov ecx, DCTSIZE/4 + alignx 16,7 +.columnloop: + + movq mm0, MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)] + movq mm1, MMWORD [MMBLOCK(3,0,edx,SIZEOF_DCTELEM)] + movq mm2, MMWORD [MMBLOCK(6,0,edx,SIZEOF_DCTELEM)] + movq mm3, MMWORD [MMBLOCK(7,0,edx,SIZEOF_DCTELEM)] + + ; mm0=(02 12 22 32), mm2=(42 52 62 72) + ; mm1=(03 13 23 33), mm3=(43 53 63 73) + + movq mm4,mm0 ; transpose coefficients(phase 1) + punpcklwd mm0,mm1 ; mm0=(02 03 12 13) + punpckhwd mm4,mm1 ; mm4=(22 23 32 33) + movq mm5,mm2 ; transpose coefficients(phase 1) + punpcklwd mm2,mm3 ; mm2=(42 43 52 53) + punpckhwd mm5,mm3 ; mm5=(62 63 72 73) + + movq mm6, MMWORD [MMBLOCK(0,0,edx,SIZEOF_DCTELEM)] + movq mm7, MMWORD [MMBLOCK(1,0,edx,SIZEOF_DCTELEM)] + movq mm1, MMWORD [MMBLOCK(4,0,edx,SIZEOF_DCTELEM)] + movq mm3, MMWORD [MMBLOCK(5,0,edx,SIZEOF_DCTELEM)] + + ; mm6=(00 10 20 30), mm1=(40 50 60 70) + ; mm7=(01 11 21 31), mm3=(41 51 61 71) + + movq MMWORD [wk(0)], mm4 ; wk(0)=(22 23 32 33) + movq MMWORD [wk(1)], mm2 ; wk(1)=(42 43 52 53) + + movq mm4,mm6 ; transpose coefficients(phase 1) + punpcklwd mm6,mm7 ; mm6=(00 01 10 11) + punpckhwd mm4,mm7 ; mm4=(20 21 30 31) + movq mm2,mm1 ; transpose coefficients(phase 1) + punpcklwd mm1,mm3 ; mm1=(40 41 50 51) + punpckhwd mm2,mm3 ; mm2=(60 61 70 71) + + movq mm7,mm6 ; transpose coefficients(phase 2) + punpckldq mm6,mm0 ; mm6=(00 01 02 03)=data0 + punpckhdq mm7,mm0 ; mm7=(10 11 12 13)=data1 + movq mm3,mm2 ; transpose coefficients(phase 2) + punpckldq mm2,mm5 ; mm2=(60 61 62 63)=data6 + punpckhdq mm3,mm5 ; mm3=(70 71 72 73)=data7 + + movq mm0,mm7 + movq mm5,mm6 + psubw mm7,mm2 ; mm7=data1-data6=tmp6 + psubw mm6,mm3 ; mm6=data0-data7=tmp7 + paddw mm0,mm2 ; mm0=data1+data6=tmp1 + paddw mm5,mm3 ; mm5=data0+data7=tmp0 + + movq mm2, MMWORD [wk(0)] ; mm2=(22 23 32 33) + movq mm3, MMWORD [wk(1)] ; mm3=(42 43 52 53) + movq MMWORD [wk(0)], mm7 ; wk(0)=tmp6 + movq MMWORD [wk(1)], mm6 ; wk(1)=tmp7 + + movq mm7,mm4 ; transpose coefficients(phase 2) + punpckldq mm4,mm2 ; mm4=(20 21 22 23)=data2 + punpckhdq mm7,mm2 ; mm7=(30 31 32 33)=data3 + movq mm6,mm1 ; transpose coefficients(phase 2) + punpckldq mm1,mm3 ; mm1=(40 41 42 43)=data4 + punpckhdq mm6,mm3 ; mm6=(50 51 52 53)=data5 + + movq mm2,mm7 + movq mm3,mm4 + paddw mm7,mm1 ; mm7=data3+data4=tmp3 + paddw mm4,mm6 ; mm4=data2+data5=tmp2 + psubw mm2,mm1 ; mm2=data3-data4=tmp4 + psubw mm3,mm6 ; mm3=data2-data5=tmp5 + + ; -- Even part + + movq mm1,mm5 + movq mm6,mm0 + paddw mm5,mm7 ; mm5=tmp10 + paddw mm0,mm4 ; mm0=tmp11 + psubw mm1,mm7 ; mm1=tmp13 + psubw mm6,mm4 ; mm6=tmp12 + + movq mm7,mm5 + paddw mm5,mm0 ; mm5=tmp10+tmp11 + psubw mm7,mm0 ; mm7=tmp10-tmp11 + + paddw mm5,[GOTOFF(ebx,PW_DESCALE_P2X)] + paddw mm7,[GOTOFF(ebx,PW_DESCALE_P2X)] + psraw mm5,PASS1_BITS ; mm5=data0 + psraw mm7,PASS1_BITS ; mm7=data4 + + movq MMWORD [MMBLOCK(0,0,edx,SIZEOF_DCTELEM)], mm5 + movq MMWORD [MMBLOCK(4,0,edx,SIZEOF_DCTELEM)], mm7 + + ; (Original) + ; z1 = (tmp12 + tmp13) * 0.541196100; + ; data2 = z1 + tmp13 * 0.765366865; + ; data6 = z1 + tmp12 * -1.847759065; + ; + ; (This implementation) + ; data2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100; + ; data6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065); + + movq mm4,mm1 ; mm1=tmp13 + movq mm0,mm1 + punpcklwd mm4,mm6 ; mm6=tmp12 + punpckhwd mm0,mm6 + movq mm1,mm4 + movq mm6,mm0 + pmaddwd mm4,[GOTOFF(ebx,PW_F130_F054)] ; mm4=data2L + pmaddwd mm0,[GOTOFF(ebx,PW_F130_F054)] ; mm0=data2H + pmaddwd mm1,[GOTOFF(ebx,PW_F054_MF130)] ; mm1=data6L + pmaddwd mm6,[GOTOFF(ebx,PW_F054_MF130)] ; mm6=data6H + + paddd mm4,[GOTOFF(ebx,PD_DESCALE_P2)] + paddd mm0,[GOTOFF(ebx,PD_DESCALE_P2)] + psrad mm4,DESCALE_P2 + psrad mm0,DESCALE_P2 + paddd mm1,[GOTOFF(ebx,PD_DESCALE_P2)] + paddd mm6,[GOTOFF(ebx,PD_DESCALE_P2)] + psrad mm1,DESCALE_P2 + psrad mm6,DESCALE_P2 + + packssdw mm4,mm0 ; mm4=data2 + packssdw mm1,mm6 ; mm1=data6 + + movq MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)], mm4 + movq MMWORD [MMBLOCK(6,0,edx,SIZEOF_DCTELEM)], mm1 + + ; -- Odd part + + movq mm5, MMWORD [wk(0)] ; mm5=tmp6 + movq mm7, MMWORD [wk(1)] ; mm7=tmp7 + + movq mm0,mm2 ; mm2=tmp4 + movq mm6,mm3 ; mm3=tmp5 + paddw mm0,mm5 ; mm0=z3 + paddw mm6,mm7 ; mm6=z4 + + ; (Original) + ; z5 = (z3 + z4) * 1.175875602; + ; z3 = z3 * -1.961570560; z4 = z4 * -0.390180644; + ; z3 += z5; z4 += z5; + ; + ; (This implementation) + ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602; + ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644); + + movq mm4,mm0 + movq mm1,mm0 + punpcklwd mm4,mm6 + punpckhwd mm1,mm6 + movq mm0,mm4 + movq mm6,mm1 + pmaddwd mm4,[GOTOFF(ebx,PW_MF078_F117)] ; mm4=z3L + pmaddwd mm1,[GOTOFF(ebx,PW_MF078_F117)] ; mm1=z3H + pmaddwd mm0,[GOTOFF(ebx,PW_F117_F078)] ; mm0=z4L + pmaddwd mm6,[GOTOFF(ebx,PW_F117_F078)] ; mm6=z4H + + movq MMWORD [wk(0)], mm4 ; wk(0)=z3L + movq MMWORD [wk(1)], mm1 ; wk(1)=z3H + + ; (Original) + ; z1 = tmp4 + tmp7; z2 = tmp5 + tmp6; + ; tmp4 = tmp4 * 0.298631336; tmp5 = tmp5 * 2.053119869; + ; tmp6 = tmp6 * 3.072711026; tmp7 = tmp7 * 1.501321110; + ; z1 = z1 * -0.899976223; z2 = z2 * -2.562915447; + ; data7 = tmp4 + z1 + z3; data5 = tmp5 + z2 + z4; + ; data3 = tmp6 + z2 + z3; data1 = tmp7 + z1 + z4; + ; + ; (This implementation) + ; tmp4 = tmp4 * (0.298631336 - 0.899976223) + tmp7 * -0.899976223; + ; tmp5 = tmp5 * (2.053119869 - 2.562915447) + tmp6 * -2.562915447; + ; tmp6 = tmp5 * -2.562915447 + tmp6 * (3.072711026 - 2.562915447); + ; tmp7 = tmp4 * -0.899976223 + tmp7 * (1.501321110 - 0.899976223); + ; data7 = tmp4 + z3; data5 = tmp5 + z4; + ; data3 = tmp6 + z3; data1 = tmp7 + z4; + + movq mm4,mm2 + movq mm1,mm2 + punpcklwd mm4,mm7 + punpckhwd mm1,mm7 + movq mm2,mm4 + movq mm7,mm1 + pmaddwd mm4,[GOTOFF(ebx,PW_MF060_MF089)] ; mm4=tmp4L + pmaddwd mm1,[GOTOFF(ebx,PW_MF060_MF089)] ; mm1=tmp4H + pmaddwd mm2,[GOTOFF(ebx,PW_MF089_F060)] ; mm2=tmp7L + pmaddwd mm7,[GOTOFF(ebx,PW_MF089_F060)] ; mm7=tmp7H + + paddd mm4, MMWORD [wk(0)] ; mm4=data7L + paddd mm1, MMWORD [wk(1)] ; mm1=data7H + paddd mm2,mm0 ; mm2=data1L + paddd mm7,mm6 ; mm7=data1H + + paddd mm4,[GOTOFF(ebx,PD_DESCALE_P2)] + paddd mm1,[GOTOFF(ebx,PD_DESCALE_P2)] + psrad mm4,DESCALE_P2 + psrad mm1,DESCALE_P2 + paddd mm2,[GOTOFF(ebx,PD_DESCALE_P2)] + paddd mm7,[GOTOFF(ebx,PD_DESCALE_P2)] + psrad mm2,DESCALE_P2 + psrad mm7,DESCALE_P2 + + packssdw mm4,mm1 ; mm4=data7 + packssdw mm2,mm7 ; mm2=data1 + + movq MMWORD [MMBLOCK(7,0,edx,SIZEOF_DCTELEM)], mm4 + movq MMWORD [MMBLOCK(1,0,edx,SIZEOF_DCTELEM)], mm2 + + movq mm1,mm3 + movq mm7,mm3 + punpcklwd mm1,mm5 + punpckhwd mm7,mm5 + movq mm3,mm1 + movq mm5,mm7 + pmaddwd mm1,[GOTOFF(ebx,PW_MF050_MF256)] ; mm1=tmp5L + pmaddwd mm7,[GOTOFF(ebx,PW_MF050_MF256)] ; mm7=tmp5H + pmaddwd mm3,[GOTOFF(ebx,PW_MF256_F050)] ; mm3=tmp6L + pmaddwd mm5,[GOTOFF(ebx,PW_MF256_F050)] ; mm5=tmp6H + + paddd mm1,mm0 ; mm1=data5L + paddd mm7,mm6 ; mm7=data5H + paddd mm3, MMWORD [wk(0)] ; mm3=data3L + paddd mm5, MMWORD [wk(1)] ; mm5=data3H + + paddd mm1,[GOTOFF(ebx,PD_DESCALE_P2)] + paddd mm7,[GOTOFF(ebx,PD_DESCALE_P2)] + psrad mm1,DESCALE_P2 + psrad mm7,DESCALE_P2 + paddd mm3,[GOTOFF(ebx,PD_DESCALE_P2)] + paddd mm5,[GOTOFF(ebx,PD_DESCALE_P2)] + psrad mm3,DESCALE_P2 + psrad mm5,DESCALE_P2 + + packssdw mm1,mm7 ; mm1=data5 + packssdw mm3,mm5 ; mm3=data3 + + movq MMWORD [MMBLOCK(5,0,edx,SIZEOF_DCTELEM)], mm1 + movq MMWORD [MMBLOCK(3,0,edx,SIZEOF_DCTELEM)], mm3 + + add edx, byte 4*SIZEOF_DCTELEM + dec ecx + jnz near .columnloop + + emms ; empty MMX state + +; pop edi ; unused +; pop esi ; unused +; pop edx ; need not be preserved +; pop ecx ; need not be preserved + poppic ebx + mov esp,ebp ; esp <- aligned ebp + pop esp ; esp <- original ebp + pop ebp + ret + +; For some reason, the OS X linker does not honor the request to align the +; segment unless we do this. + align 16 diff --git a/media/libjpeg/simd/jfdctint-sse2-64.asm b/media/libjpeg/simd/jfdctint-sse2-64.asm new file mode 100644 index 000000000..9a0ca0fd2 --- /dev/null +++ b/media/libjpeg/simd/jfdctint-sse2-64.asm @@ -0,0 +1,621 @@ +; +; jfdctint.asm - accurate integer FDCT (64-bit SSE2) +; +; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB +; Copyright (C) 2009, D. R. Commander. +; +; Based on the x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler). +; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 +; +; This file contains a slow-but-accurate integer implementation of the +; forward DCT (Discrete Cosine Transform). The following code is based +; directly on the IJG's original jfdctint.c; see the jfdctint.c for +; more details. +; +; [TAB8] + +%include "jsimdext.inc" +%include "jdct.inc" + +; -------------------------------------------------------------------------- + +%define CONST_BITS 13 +%define PASS1_BITS 2 + +%define DESCALE_P1 (CONST_BITS-PASS1_BITS) +%define DESCALE_P2 (CONST_BITS+PASS1_BITS) + +%if CONST_BITS == 13 +F_0_298 equ 2446 ; FIX(0.298631336) +F_0_390 equ 3196 ; FIX(0.390180644) +F_0_541 equ 4433 ; FIX(0.541196100) +F_0_765 equ 6270 ; FIX(0.765366865) +F_0_899 equ 7373 ; FIX(0.899976223) +F_1_175 equ 9633 ; FIX(1.175875602) +F_1_501 equ 12299 ; FIX(1.501321110) +F_1_847 equ 15137 ; FIX(1.847759065) +F_1_961 equ 16069 ; FIX(1.961570560) +F_2_053 equ 16819 ; FIX(2.053119869) +F_2_562 equ 20995 ; FIX(2.562915447) +F_3_072 equ 25172 ; FIX(3.072711026) +%else +; NASM cannot do compile-time arithmetic on floating-point constants. +%define DESCALE(x,n) (((x)+(1<<((n)-1)))>>(n)) +F_0_298 equ DESCALE( 320652955,30-CONST_BITS) ; FIX(0.298631336) +F_0_390 equ DESCALE( 418953276,30-CONST_BITS) ; FIX(0.390180644) +F_0_541 equ DESCALE( 581104887,30-CONST_BITS) ; FIX(0.541196100) +F_0_765 equ DESCALE( 821806413,30-CONST_BITS) ; FIX(0.765366865) +F_0_899 equ DESCALE( 966342111,30-CONST_BITS) ; FIX(0.899976223) +F_1_175 equ DESCALE(1262586813,30-CONST_BITS) ; FIX(1.175875602) +F_1_501 equ DESCALE(1612031267,30-CONST_BITS) ; FIX(1.501321110) +F_1_847 equ DESCALE(1984016188,30-CONST_BITS) ; FIX(1.847759065) +F_1_961 equ DESCALE(2106220350,30-CONST_BITS) ; FIX(1.961570560) +F_2_053 equ DESCALE(2204520673,30-CONST_BITS) ; FIX(2.053119869) +F_2_562 equ DESCALE(2751909506,30-CONST_BITS) ; FIX(2.562915447) +F_3_072 equ DESCALE(3299298341,30-CONST_BITS) ; FIX(3.072711026) +%endif + +; -------------------------------------------------------------------------- + SECTION SEG_CONST + + alignz 16 + global EXTN(jconst_fdct_islow_sse2) + +EXTN(jconst_fdct_islow_sse2): + +PW_F130_F054 times 4 dw (F_0_541+F_0_765), F_0_541 +PW_F054_MF130 times 4 dw F_0_541, (F_0_541-F_1_847) +PW_MF078_F117 times 4 dw (F_1_175-F_1_961), F_1_175 +PW_F117_F078 times 4 dw F_1_175, (F_1_175-F_0_390) +PW_MF060_MF089 times 4 dw (F_0_298-F_0_899),-F_0_899 +PW_MF089_F060 times 4 dw -F_0_899, (F_1_501-F_0_899) +PW_MF050_MF256 times 4 dw (F_2_053-F_2_562),-F_2_562 +PW_MF256_F050 times 4 dw -F_2_562, (F_3_072-F_2_562) +PD_DESCALE_P1 times 4 dd 1 << (DESCALE_P1-1) +PD_DESCALE_P2 times 4 dd 1 << (DESCALE_P2-1) +PW_DESCALE_P2X times 8 dw 1 << (PASS1_BITS-1) + + alignz 16 + +; -------------------------------------------------------------------------- + SECTION SEG_TEXT + BITS 64 +; +; Perform the forward DCT on one block of samples. +; +; GLOBAL(void) +; jsimd_fdct_islow_sse2 (DCTELEM *data) +; + +; r10 = DCTELEM *data + +%define wk(i) rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM] +%define WK_NUM 6 + + align 16 + global EXTN(jsimd_fdct_islow_sse2) + +EXTN(jsimd_fdct_islow_sse2): + push rbp + mov rax,rsp ; rax = original rbp + sub rsp, byte 4 + and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits + mov [rsp],rax + mov rbp,rsp ; rbp = aligned rbp + lea rsp, [wk(0)] + collect_args + + ; ---- Pass 1: process rows. + + mov rdx, r10 ; (DCTELEM *) + + movdqa xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_DCTELEM)] + movdqa xmm1, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_DCTELEM)] + movdqa xmm2, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_DCTELEM)] + movdqa xmm3, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_DCTELEM)] + + ; xmm0=(00 01 02 03 04 05 06 07), xmm2=(20 21 22 23 24 25 26 27) + ; xmm1=(10 11 12 13 14 15 16 17), xmm3=(30 31 32 33 34 35 36 37) + + movdqa xmm4,xmm0 ; transpose coefficients(phase 1) + punpcklwd xmm0,xmm1 ; xmm0=(00 10 01 11 02 12 03 13) + punpckhwd xmm4,xmm1 ; xmm4=(04 14 05 15 06 16 07 17) + movdqa xmm5,xmm2 ; transpose coefficients(phase 1) + punpcklwd xmm2,xmm3 ; xmm2=(20 30 21 31 22 32 23 33) + punpckhwd xmm5,xmm3 ; xmm5=(24 34 25 35 26 36 27 37) + + movdqa xmm6, XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_DCTELEM)] + movdqa xmm7, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_DCTELEM)] + movdqa xmm1, XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_DCTELEM)] + movdqa xmm3, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_DCTELEM)] + + ; xmm6=( 4 12 20 28 36 44 52 60), xmm1=( 6 14 22 30 38 46 54 62) + ; xmm7=( 5 13 21 29 37 45 53 61), xmm3=( 7 15 23 31 39 47 55 63) + + movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=(20 30 21 31 22 32 23 33) + movdqa XMMWORD [wk(1)], xmm5 ; wk(1)=(24 34 25 35 26 36 27 37) + + movdqa xmm2,xmm6 ; transpose coefficients(phase 1) + punpcklwd xmm6,xmm7 ; xmm6=(40 50 41 51 42 52 43 53) + punpckhwd xmm2,xmm7 ; xmm2=(44 54 45 55 46 56 47 57) + movdqa xmm5,xmm1 ; transpose coefficients(phase 1) + punpcklwd xmm1,xmm3 ; xmm1=(60 70 61 71 62 72 63 73) + punpckhwd xmm5,xmm3 ; xmm5=(64 74 65 75 66 76 67 77) + + movdqa xmm7,xmm6 ; transpose coefficients(phase 2) + punpckldq xmm6,xmm1 ; xmm6=(40 50 60 70 41 51 61 71) + punpckhdq xmm7,xmm1 ; xmm7=(42 52 62 72 43 53 63 73) + movdqa xmm3,xmm2 ; transpose coefficients(phase 2) + punpckldq xmm2,xmm5 ; xmm2=(44 54 64 74 45 55 65 75) + punpckhdq xmm3,xmm5 ; xmm3=(46 56 66 76 47 57 67 77) + + movdqa xmm1, XMMWORD [wk(0)] ; xmm1=(20 30 21 31 22 32 23 33) + movdqa xmm5, XMMWORD [wk(1)] ; xmm5=(24 34 25 35 26 36 27 37) + movdqa XMMWORD [wk(2)], xmm7 ; wk(2)=(42 52 62 72 43 53 63 73) + movdqa XMMWORD [wk(3)], xmm2 ; wk(3)=(44 54 64 74 45 55 65 75) + + movdqa xmm7,xmm0 ; transpose coefficients(phase 2) + punpckldq xmm0,xmm1 ; xmm0=(00 10 20 30 01 11 21 31) + punpckhdq xmm7,xmm1 ; xmm7=(02 12 22 32 03 13 23 33) + movdqa xmm2,xmm4 ; transpose coefficients(phase 2) + punpckldq xmm4,xmm5 ; xmm4=(04 14 24 34 05 15 25 35) + punpckhdq xmm2,xmm5 ; xmm2=(06 16 26 36 07 17 27 37) + + movdqa xmm1,xmm0 ; transpose coefficients(phase 3) + punpcklqdq xmm0,xmm6 ; xmm0=(00 10 20 30 40 50 60 70)=data0 + punpckhqdq xmm1,xmm6 ; xmm1=(01 11 21 31 41 51 61 71)=data1 + movdqa xmm5,xmm2 ; transpose coefficients(phase 3) + punpcklqdq xmm2,xmm3 ; xmm2=(06 16 26 36 46 56 66 76)=data6 + punpckhqdq xmm5,xmm3 ; xmm5=(07 17 27 37 47 57 67 77)=data7 + + movdqa xmm6,xmm1 + movdqa xmm3,xmm0 + psubw xmm1,xmm2 ; xmm1=data1-data6=tmp6 + psubw xmm0,xmm5 ; xmm0=data0-data7=tmp7 + paddw xmm6,xmm2 ; xmm6=data1+data6=tmp1 + paddw xmm3,xmm5 ; xmm3=data0+data7=tmp0 + + movdqa xmm2, XMMWORD [wk(2)] ; xmm2=(42 52 62 72 43 53 63 73) + movdqa xmm5, XMMWORD [wk(3)] ; xmm5=(44 54 64 74 45 55 65 75) + movdqa XMMWORD [wk(0)], xmm1 ; wk(0)=tmp6 + movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=tmp7 + + movdqa xmm1,xmm7 ; transpose coefficients(phase 3) + punpcklqdq xmm7,xmm2 ; xmm7=(02 12 22 32 42 52 62 72)=data2 + punpckhqdq xmm1,xmm2 ; xmm1=(03 13 23 33 43 53 63 73)=data3 + movdqa xmm0,xmm4 ; transpose coefficients(phase 3) + punpcklqdq xmm4,xmm5 ; xmm4=(04 14 24 34 44 54 64 74)=data4 + punpckhqdq xmm0,xmm5 ; xmm0=(05 15 25 35 45 55 65 75)=data5 + + movdqa xmm2,xmm1 + movdqa xmm5,xmm7 + paddw xmm1,xmm4 ; xmm1=data3+data4=tmp3 + paddw xmm7,xmm0 ; xmm7=data2+data5=tmp2 + psubw xmm2,xmm4 ; xmm2=data3-data4=tmp4 + psubw xmm5,xmm0 ; xmm5=data2-data5=tmp5 + + ; -- Even part + + movdqa xmm4,xmm3 + movdqa xmm0,xmm6 + paddw xmm3,xmm1 ; xmm3=tmp10 + paddw xmm6,xmm7 ; xmm6=tmp11 + psubw xmm4,xmm1 ; xmm4=tmp13 + psubw xmm0,xmm7 ; xmm0=tmp12 + + movdqa xmm1,xmm3 + paddw xmm3,xmm6 ; xmm3=tmp10+tmp11 + psubw xmm1,xmm6 ; xmm1=tmp10-tmp11 + + psllw xmm3,PASS1_BITS ; xmm3=data0 + psllw xmm1,PASS1_BITS ; xmm1=data4 + + movdqa XMMWORD [wk(2)], xmm3 ; wk(2)=data0 + movdqa XMMWORD [wk(3)], xmm1 ; wk(3)=data4 + + ; (Original) + ; z1 = (tmp12 + tmp13) * 0.541196100; + ; data2 = z1 + tmp13 * 0.765366865; + ; data6 = z1 + tmp12 * -1.847759065; + ; + ; (This implementation) + ; data2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100; + ; data6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065); + + movdqa xmm7,xmm4 ; xmm4=tmp13 + movdqa xmm6,xmm4 + punpcklwd xmm7,xmm0 ; xmm0=tmp12 + punpckhwd xmm6,xmm0 + movdqa xmm4,xmm7 + movdqa xmm0,xmm6 + pmaddwd xmm7,[rel PW_F130_F054] ; xmm7=data2L + pmaddwd xmm6,[rel PW_F130_F054] ; xmm6=data2H + pmaddwd xmm4,[rel PW_F054_MF130] ; xmm4=data6L + pmaddwd xmm0,[rel PW_F054_MF130] ; xmm0=data6H + + paddd xmm7,[rel PD_DESCALE_P1] + paddd xmm6,[rel PD_DESCALE_P1] + psrad xmm7,DESCALE_P1 + psrad xmm6,DESCALE_P1 + paddd xmm4,[rel PD_DESCALE_P1] + paddd xmm0,[rel PD_DESCALE_P1] + psrad xmm4,DESCALE_P1 + psrad xmm0,DESCALE_P1 + + packssdw xmm7,xmm6 ; xmm7=data2 + packssdw xmm4,xmm0 ; xmm4=data6 + + movdqa XMMWORD [wk(4)], xmm7 ; wk(4)=data2 + movdqa XMMWORD [wk(5)], xmm4 ; wk(5)=data6 + + ; -- Odd part + + movdqa xmm3, XMMWORD [wk(0)] ; xmm3=tmp6 + movdqa xmm1, XMMWORD [wk(1)] ; xmm1=tmp7 + + movdqa xmm6,xmm2 ; xmm2=tmp4 + movdqa xmm0,xmm5 ; xmm5=tmp5 + paddw xmm6,xmm3 ; xmm6=z3 + paddw xmm0,xmm1 ; xmm0=z4 + + ; (Original) + ; z5 = (z3 + z4) * 1.175875602; + ; z3 = z3 * -1.961570560; z4 = z4 * -0.390180644; + ; z3 += z5; z4 += z5; + ; + ; (This implementation) + ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602; + ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644); + + movdqa xmm7,xmm6 + movdqa xmm4,xmm6 + punpcklwd xmm7,xmm0 + punpckhwd xmm4,xmm0 + movdqa xmm6,xmm7 + movdqa xmm0,xmm4 + pmaddwd xmm7,[rel PW_MF078_F117] ; xmm7=z3L + pmaddwd xmm4,[rel PW_MF078_F117] ; xmm4=z3H + pmaddwd xmm6,[rel PW_F117_F078] ; xmm6=z4L + pmaddwd xmm0,[rel PW_F117_F078] ; xmm0=z4H + + movdqa XMMWORD [wk(0)], xmm7 ; wk(0)=z3L + movdqa XMMWORD [wk(1)], xmm4 ; wk(1)=z3H + + ; (Original) + ; z1 = tmp4 + tmp7; z2 = tmp5 + tmp6; + ; tmp4 = tmp4 * 0.298631336; tmp5 = tmp5 * 2.053119869; + ; tmp6 = tmp6 * 3.072711026; tmp7 = tmp7 * 1.501321110; + ; z1 = z1 * -0.899976223; z2 = z2 * -2.562915447; + ; data7 = tmp4 + z1 + z3; data5 = tmp5 + z2 + z4; + ; data3 = tmp6 + z2 + z3; data1 = tmp7 + z1 + z4; + ; + ; (This implementation) + ; tmp4 = tmp4 * (0.298631336 - 0.899976223) + tmp7 * -0.899976223; + ; tmp5 = tmp5 * (2.053119869 - 2.562915447) + tmp6 * -2.562915447; + ; tmp6 = tmp5 * -2.562915447 + tmp6 * (3.072711026 - 2.562915447); + ; tmp7 = tmp4 * -0.899976223 + tmp7 * (1.501321110 - 0.899976223); + ; data7 = tmp4 + z3; data5 = tmp5 + z4; + ; data3 = tmp6 + z3; data1 = tmp7 + z4; + + movdqa xmm7,xmm2 + movdqa xmm4,xmm2 + punpcklwd xmm7,xmm1 + punpckhwd xmm4,xmm1 + movdqa xmm2,xmm7 + movdqa xmm1,xmm4 + pmaddwd xmm7,[rel PW_MF060_MF089] ; xmm7=tmp4L + pmaddwd xmm4,[rel PW_MF060_MF089] ; xmm4=tmp4H + pmaddwd xmm2,[rel PW_MF089_F060] ; xmm2=tmp7L + pmaddwd xmm1,[rel PW_MF089_F060] ; xmm1=tmp7H + + paddd xmm7, XMMWORD [wk(0)] ; xmm7=data7L + paddd xmm4, XMMWORD [wk(1)] ; xmm4=data7H + paddd xmm2,xmm6 ; xmm2=data1L + paddd xmm1,xmm0 ; xmm1=data1H + + paddd xmm7,[rel PD_DESCALE_P1] + paddd xmm4,[rel PD_DESCALE_P1] + psrad xmm7,DESCALE_P1 + psrad xmm4,DESCALE_P1 + paddd xmm2,[rel PD_DESCALE_P1] + paddd xmm1,[rel PD_DESCALE_P1] + psrad xmm2,DESCALE_P1 + psrad xmm1,DESCALE_P1 + + packssdw xmm7,xmm4 ; xmm7=data7 + packssdw xmm2,xmm1 ; xmm2=data1 + + movdqa xmm4,xmm5 + movdqa xmm1,xmm5 + punpcklwd xmm4,xmm3 + punpckhwd xmm1,xmm3 + movdqa xmm5,xmm4 + movdqa xmm3,xmm1 + pmaddwd xmm4,[rel PW_MF050_MF256] ; xmm4=tmp5L + pmaddwd xmm1,[rel PW_MF050_MF256] ; xmm1=tmp5H + pmaddwd xmm5,[rel PW_MF256_F050] ; xmm5=tmp6L + pmaddwd xmm3,[rel PW_MF256_F050] ; xmm3=tmp6H + + paddd xmm4,xmm6 ; xmm4=data5L + paddd xmm1,xmm0 ; xmm1=data5H + paddd xmm5, XMMWORD [wk(0)] ; xmm5=data3L + paddd xmm3, XMMWORD [wk(1)] ; xmm3=data3H + + paddd xmm4,[rel PD_DESCALE_P1] + paddd xmm1,[rel PD_DESCALE_P1] + psrad xmm4,DESCALE_P1 + psrad xmm1,DESCALE_P1 + paddd xmm5,[rel PD_DESCALE_P1] + paddd xmm3,[rel PD_DESCALE_P1] + psrad xmm5,DESCALE_P1 + psrad xmm3,DESCALE_P1 + + packssdw xmm4,xmm1 ; xmm4=data5 + packssdw xmm5,xmm3 ; xmm5=data3 + + ; ---- Pass 2: process columns. + + movdqa xmm6, XMMWORD [wk(2)] ; xmm6=col0 + movdqa xmm0, XMMWORD [wk(4)] ; xmm0=col2 + + ; xmm6=(00 10 20 30 40 50 60 70), xmm0=(02 12 22 32 42 52 62 72) + ; xmm2=(01 11 21 31 41 51 61 71), xmm5=(03 13 23 33 43 53 63 73) + + movdqa xmm1,xmm6 ; transpose coefficients(phase 1) + punpcklwd xmm6,xmm2 ; xmm6=(00 01 10 11 20 21 30 31) + punpckhwd xmm1,xmm2 ; xmm1=(40 41 50 51 60 61 70 71) + movdqa xmm3,xmm0 ; transpose coefficients(phase 1) + punpcklwd xmm0,xmm5 ; xmm0=(02 03 12 13 22 23 32 33) + punpckhwd xmm3,xmm5 ; xmm3=(42 43 52 53 62 63 72 73) + + movdqa xmm2, XMMWORD [wk(3)] ; xmm2=col4 + movdqa xmm5, XMMWORD [wk(5)] ; xmm5=col6 + + ; xmm2=(04 14 24 34 44 54 64 74), xmm5=(06 16 26 36 46 56 66 76) + ; xmm4=(05 15 25 35 45 55 65 75), xmm7=(07 17 27 37 47 57 67 77) + + movdqa XMMWORD [wk(0)], xmm0 ; wk(0)=(02 03 12 13 22 23 32 33) + movdqa XMMWORD [wk(1)], xmm3 ; wk(1)=(42 43 52 53 62 63 72 73) + + movdqa xmm0,xmm2 ; transpose coefficients(phase 1) + punpcklwd xmm2,xmm4 ; xmm2=(04 05 14 15 24 25 34 35) + punpckhwd xmm0,xmm4 ; xmm0=(44 45 54 55 64 65 74 75) + movdqa xmm3,xmm5 ; transpose coefficients(phase 1) + punpcklwd xmm5,xmm7 ; xmm5=(06 07 16 17 26 27 36 37) + punpckhwd xmm3,xmm7 ; xmm3=(46 47 56 57 66 67 76 77) + + movdqa xmm4,xmm2 ; transpose coefficients(phase 2) + punpckldq xmm2,xmm5 ; xmm2=(04 05 06 07 14 15 16 17) + punpckhdq xmm4,xmm5 ; xmm4=(24 25 26 27 34 35 36 37) + movdqa xmm7,xmm0 ; transpose coefficients(phase 2) + punpckldq xmm0,xmm3 ; xmm0=(44 45 46 47 54 55 56 57) + punpckhdq xmm7,xmm3 ; xmm7=(64 65 66 67 74 75 76 77) + + movdqa xmm5, XMMWORD [wk(0)] ; xmm5=(02 03 12 13 22 23 32 33) + movdqa xmm3, XMMWORD [wk(1)] ; xmm3=(42 43 52 53 62 63 72 73) + movdqa XMMWORD [wk(2)], xmm4 ; wk(2)=(24 25 26 27 34 35 36 37) + movdqa XMMWORD [wk(3)], xmm0 ; wk(3)=(44 45 46 47 54 55 56 57) + + movdqa xmm4,xmm6 ; transpose coefficients(phase 2) + punpckldq xmm6,xmm5 ; xmm6=(00 01 02 03 10 11 12 13) + punpckhdq xmm4,xmm5 ; xmm4=(20 21 22 23 30 31 32 33) + movdqa xmm0,xmm1 ; transpose coefficients(phase 2) + punpckldq xmm1,xmm3 ; xmm1=(40 41 42 43 50 51 52 53) + punpckhdq xmm0,xmm3 ; xmm0=(60 61 62 63 70 71 72 73) + + movdqa xmm5,xmm6 ; transpose coefficients(phase 3) + punpcklqdq xmm6,xmm2 ; xmm6=(00 01 02 03 04 05 06 07)=data0 + punpckhqdq xmm5,xmm2 ; xmm5=(10 11 12 13 14 15 16 17)=data1 + movdqa xmm3,xmm0 ; transpose coefficients(phase 3) + punpcklqdq xmm0,xmm7 ; xmm0=(60 61 62 63 64 65 66 67)=data6 + punpckhqdq xmm3,xmm7 ; xmm3=(70 71 72 73 74 75 76 77)=data7 + + movdqa xmm2,xmm5 + movdqa xmm7,xmm6 + psubw xmm5,xmm0 ; xmm5=data1-data6=tmp6 + psubw xmm6,xmm3 ; xmm6=data0-data7=tmp7 + paddw xmm2,xmm0 ; xmm2=data1+data6=tmp1 + paddw xmm7,xmm3 ; xmm7=data0+data7=tmp0 + + movdqa xmm0, XMMWORD [wk(2)] ; xmm0=(24 25 26 27 34 35 36 37) + movdqa xmm3, XMMWORD [wk(3)] ; xmm3=(44 45 46 47 54 55 56 57) + movdqa XMMWORD [wk(0)], xmm5 ; wk(0)=tmp6 + movdqa XMMWORD [wk(1)], xmm6 ; wk(1)=tmp7 + + movdqa xmm5,xmm4 ; transpose coefficients(phase 3) + punpcklqdq xmm4,xmm0 ; xmm4=(20 21 22 23 24 25 26 27)=data2 + punpckhqdq xmm5,xmm0 ; xmm5=(30 31 32 33 34 35 36 37)=data3 + movdqa xmm6,xmm1 ; transpose coefficients(phase 3) + punpcklqdq xmm1,xmm3 ; xmm1=(40 41 42 43 44 45 46 47)=data4 + punpckhqdq xmm6,xmm3 ; xmm6=(50 51 52 53 54 55 56 57)=data5 + + movdqa xmm0,xmm5 + movdqa xmm3,xmm4 + paddw xmm5,xmm1 ; xmm5=data3+data4=tmp3 + paddw xmm4,xmm6 ; xmm4=data2+data5=tmp2 + psubw xmm0,xmm1 ; xmm0=data3-data4=tmp4 + psubw xmm3,xmm6 ; xmm3=data2-data5=tmp5 + + ; -- Even part + + movdqa xmm1,xmm7 + movdqa xmm6,xmm2 + paddw xmm7,xmm5 ; xmm7=tmp10 + paddw xmm2,xmm4 ; xmm2=tmp11 + psubw xmm1,xmm5 ; xmm1=tmp13 + psubw xmm6,xmm4 ; xmm6=tmp12 + + movdqa xmm5,xmm7 + paddw xmm7,xmm2 ; xmm7=tmp10+tmp11 + psubw xmm5,xmm2 ; xmm5=tmp10-tmp11 + + paddw xmm7,[rel PW_DESCALE_P2X] + paddw xmm5,[rel PW_DESCALE_P2X] + psraw xmm7,PASS1_BITS ; xmm7=data0 + psraw xmm5,PASS1_BITS ; xmm5=data4 + + movdqa XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_DCTELEM)], xmm7 + movdqa XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_DCTELEM)], xmm5 + + ; (Original) + ; z1 = (tmp12 + tmp13) * 0.541196100; + ; data2 = z1 + tmp13 * 0.765366865; + ; data6 = z1 + tmp12 * -1.847759065; + ; + ; (This implementation) + ; data2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100; + ; data6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065); + + movdqa xmm4,xmm1 ; xmm1=tmp13 + movdqa xmm2,xmm1 + punpcklwd xmm4,xmm6 ; xmm6=tmp12 + punpckhwd xmm2,xmm6 + movdqa xmm1,xmm4 + movdqa xmm6,xmm2 + pmaddwd xmm4,[rel PW_F130_F054] ; xmm4=data2L + pmaddwd xmm2,[rel PW_F130_F054] ; xmm2=data2H + pmaddwd xmm1,[rel PW_F054_MF130] ; xmm1=data6L + pmaddwd xmm6,[rel PW_F054_MF130] ; xmm6=data6H + + paddd xmm4,[rel PD_DESCALE_P2] + paddd xmm2,[rel PD_DESCALE_P2] + psrad xmm4,DESCALE_P2 + psrad xmm2,DESCALE_P2 + paddd xmm1,[rel PD_DESCALE_P2] + paddd xmm6,[rel PD_DESCALE_P2] + psrad xmm1,DESCALE_P2 + psrad xmm6,DESCALE_P2 + + packssdw xmm4,xmm2 ; xmm4=data2 + packssdw xmm1,xmm6 ; xmm1=data6 + + movdqa XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_DCTELEM)], xmm4 + movdqa XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_DCTELEM)], xmm1 + + ; -- Odd part + + movdqa xmm7, XMMWORD [wk(0)] ; xmm7=tmp6 + movdqa xmm5, XMMWORD [wk(1)] ; xmm5=tmp7 + + movdqa xmm2,xmm0 ; xmm0=tmp4 + movdqa xmm6,xmm3 ; xmm3=tmp5 + paddw xmm2,xmm7 ; xmm2=z3 + paddw xmm6,xmm5 ; xmm6=z4 + + ; (Original) + ; z5 = (z3 + z4) * 1.175875602; + ; z3 = z3 * -1.961570560; z4 = z4 * -0.390180644; + ; z3 += z5; z4 += z5; + ; + ; (This implementation) + ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602; + ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644); + + movdqa xmm4,xmm2 + movdqa xmm1,xmm2 + punpcklwd xmm4,xmm6 + punpckhwd xmm1,xmm6 + movdqa xmm2,xmm4 + movdqa xmm6,xmm1 + pmaddwd xmm4,[rel PW_MF078_F117] ; xmm4=z3L + pmaddwd xmm1,[rel PW_MF078_F117] ; xmm1=z3H + pmaddwd xmm2,[rel PW_F117_F078] ; xmm2=z4L + pmaddwd xmm6,[rel PW_F117_F078] ; xmm6=z4H + + movdqa XMMWORD [wk(0)], xmm4 ; wk(0)=z3L + movdqa XMMWORD [wk(1)], xmm1 ; wk(1)=z3H + + ; (Original) + ; z1 = tmp4 + tmp7; z2 = tmp5 + tmp6; + ; tmp4 = tmp4 * 0.298631336; tmp5 = tmp5 * 2.053119869; + ; tmp6 = tmp6 * 3.072711026; tmp7 = tmp7 * 1.501321110; + ; z1 = z1 * -0.899976223; z2 = z2 * -2.562915447; + ; data7 = tmp4 + z1 + z3; data5 = tmp5 + z2 + z4; + ; data3 = tmp6 + z2 + z3; data1 = tmp7 + z1 + z4; + ; + ; (This implementation) + ; tmp4 = tmp4 * (0.298631336 - 0.899976223) + tmp7 * -0.899976223; + ; tmp5 = tmp5 * (2.053119869 - 2.562915447) + tmp6 * -2.562915447; + ; tmp6 = tmp5 * -2.562915447 + tmp6 * (3.072711026 - 2.562915447); + ; tmp7 = tmp4 * -0.899976223 + tmp7 * (1.501321110 - 0.899976223); + ; data7 = tmp4 + z3; data5 = tmp5 + z4; + ; data3 = tmp6 + z3; data1 = tmp7 + z4; + + movdqa xmm4,xmm0 + movdqa xmm1,xmm0 + punpcklwd xmm4,xmm5 + punpckhwd xmm1,xmm5 + movdqa xmm0,xmm4 + movdqa xmm5,xmm1 + pmaddwd xmm4,[rel PW_MF060_MF089] ; xmm4=tmp4L + pmaddwd xmm1,[rel PW_MF060_MF089] ; xmm1=tmp4H + pmaddwd xmm0,[rel PW_MF089_F060] ; xmm0=tmp7L + pmaddwd xmm5,[rel PW_MF089_F060] ; xmm5=tmp7H + + paddd xmm4, XMMWORD [wk(0)] ; xmm4=data7L + paddd xmm1, XMMWORD [wk(1)] ; xmm1=data7H + paddd xmm0,xmm2 ; xmm0=data1L + paddd xmm5,xmm6 ; xmm5=data1H + + paddd xmm4,[rel PD_DESCALE_P2] + paddd xmm1,[rel PD_DESCALE_P2] + psrad xmm4,DESCALE_P2 + psrad xmm1,DESCALE_P2 + paddd xmm0,[rel PD_DESCALE_P2] + paddd xmm5,[rel PD_DESCALE_P2] + psrad xmm0,DESCALE_P2 + psrad xmm5,DESCALE_P2 + + packssdw xmm4,xmm1 ; xmm4=data7 + packssdw xmm0,xmm5 ; xmm0=data1 + + movdqa XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_DCTELEM)], xmm4 + movdqa XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_DCTELEM)], xmm0 + + movdqa xmm1,xmm3 + movdqa xmm5,xmm3 + punpcklwd xmm1,xmm7 + punpckhwd xmm5,xmm7 + movdqa xmm3,xmm1 + movdqa xmm7,xmm5 + pmaddwd xmm1,[rel PW_MF050_MF256] ; xmm1=tmp5L + pmaddwd xmm5,[rel PW_MF050_MF256] ; xmm5=tmp5H + pmaddwd xmm3,[rel PW_MF256_F050] ; xmm3=tmp6L + pmaddwd xmm7,[rel PW_MF256_F050] ; xmm7=tmp6H + + paddd xmm1,xmm2 ; xmm1=data5L + paddd xmm5,xmm6 ; xmm5=data5H + paddd xmm3, XMMWORD [wk(0)] ; xmm3=data3L + paddd xmm7, XMMWORD [wk(1)] ; xmm7=data3H + + paddd xmm1,[rel PD_DESCALE_P2] + paddd xmm5,[rel PD_DESCALE_P2] + psrad xmm1,DESCALE_P2 + psrad xmm5,DESCALE_P2 + paddd xmm3,[rel PD_DESCALE_P2] + paddd xmm7,[rel PD_DESCALE_P2] + psrad xmm3,DESCALE_P2 + psrad xmm7,DESCALE_P2 + + packssdw xmm1,xmm5 ; xmm1=data5 + packssdw xmm3,xmm7 ; xmm3=data3 + + movdqa XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_DCTELEM)], xmm1 + movdqa XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_DCTELEM)], xmm3 + + uncollect_args + mov rsp,rbp ; rsp <- aligned rbp + pop rsp ; rsp <- original rbp + pop rbp + ret + +; For some reason, the OS X linker does not honor the request to align the +; segment unless we do this. + align 16 diff --git a/media/libjpeg/simd/jfdctint-sse2.asm b/media/libjpeg/simd/jfdctint-sse2.asm new file mode 100644 index 000000000..db9d0bbe4 --- /dev/null +++ b/media/libjpeg/simd/jfdctint-sse2.asm @@ -0,0 +1,633 @@ +; +; jfdctint.asm - accurate integer FDCT (SSE2) +; +; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB +; +; Based on the x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler). +; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 +; +; This file contains a slow-but-accurate integer implementation of the +; forward DCT (Discrete Cosine Transform). The following code is based +; directly on the IJG's original jfdctint.c; see the jfdctint.c for +; more details. +; +; [TAB8] + +%include "jsimdext.inc" +%include "jdct.inc" + +; -------------------------------------------------------------------------- + +%define CONST_BITS 13 +%define PASS1_BITS 2 + +%define DESCALE_P1 (CONST_BITS-PASS1_BITS) +%define DESCALE_P2 (CONST_BITS+PASS1_BITS) + +%if CONST_BITS == 13 +F_0_298 equ 2446 ; FIX(0.298631336) +F_0_390 equ 3196 ; FIX(0.390180644) +F_0_541 equ 4433 ; FIX(0.541196100) +F_0_765 equ 6270 ; FIX(0.765366865) +F_0_899 equ 7373 ; FIX(0.899976223) +F_1_175 equ 9633 ; FIX(1.175875602) +F_1_501 equ 12299 ; FIX(1.501321110) +F_1_847 equ 15137 ; FIX(1.847759065) +F_1_961 equ 16069 ; FIX(1.961570560) +F_2_053 equ 16819 ; FIX(2.053119869) +F_2_562 equ 20995 ; FIX(2.562915447) +F_3_072 equ 25172 ; FIX(3.072711026) +%else +; NASM cannot do compile-time arithmetic on floating-point constants. +%define DESCALE(x,n) (((x)+(1<<((n)-1)))>>(n)) +F_0_298 equ DESCALE( 320652955,30-CONST_BITS) ; FIX(0.298631336) +F_0_390 equ DESCALE( 418953276,30-CONST_BITS) ; FIX(0.390180644) +F_0_541 equ DESCALE( 581104887,30-CONST_BITS) ; FIX(0.541196100) +F_0_765 equ DESCALE( 821806413,30-CONST_BITS) ; FIX(0.765366865) +F_0_899 equ DESCALE( 966342111,30-CONST_BITS) ; FIX(0.899976223) +F_1_175 equ DESCALE(1262586813,30-CONST_BITS) ; FIX(1.175875602) +F_1_501 equ DESCALE(1612031267,30-CONST_BITS) ; FIX(1.501321110) +F_1_847 equ DESCALE(1984016188,30-CONST_BITS) ; FIX(1.847759065) +F_1_961 equ DESCALE(2106220350,30-CONST_BITS) ; FIX(1.961570560) +F_2_053 equ DESCALE(2204520673,30-CONST_BITS) ; FIX(2.053119869) +F_2_562 equ DESCALE(2751909506,30-CONST_BITS) ; FIX(2.562915447) +F_3_072 equ DESCALE(3299298341,30-CONST_BITS) ; FIX(3.072711026) +%endif + +; -------------------------------------------------------------------------- + SECTION SEG_CONST + + alignz 16 + global EXTN(jconst_fdct_islow_sse2) + +EXTN(jconst_fdct_islow_sse2): + +PW_F130_F054 times 4 dw (F_0_541+F_0_765), F_0_541 +PW_F054_MF130 times 4 dw F_0_541, (F_0_541-F_1_847) +PW_MF078_F117 times 4 dw (F_1_175-F_1_961), F_1_175 +PW_F117_F078 times 4 dw F_1_175, (F_1_175-F_0_390) +PW_MF060_MF089 times 4 dw (F_0_298-F_0_899),-F_0_899 +PW_MF089_F060 times 4 dw -F_0_899, (F_1_501-F_0_899) +PW_MF050_MF256 times 4 dw (F_2_053-F_2_562),-F_2_562 +PW_MF256_F050 times 4 dw -F_2_562, (F_3_072-F_2_562) +PD_DESCALE_P1 times 4 dd 1 << (DESCALE_P1-1) +PD_DESCALE_P2 times 4 dd 1 << (DESCALE_P2-1) +PW_DESCALE_P2X times 8 dw 1 << (PASS1_BITS-1) + + alignz 16 + +; -------------------------------------------------------------------------- + SECTION SEG_TEXT + BITS 32 +; +; Perform the forward DCT on one block of samples. +; +; GLOBAL(void) +; jsimd_fdct_islow_sse2 (DCTELEM *data) +; + +%define data(b) (b)+8 ; DCTELEM *data + +%define original_ebp ebp+0 +%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM] +%define WK_NUM 6 + + align 16 + global EXTN(jsimd_fdct_islow_sse2) + +EXTN(jsimd_fdct_islow_sse2): + push ebp + mov eax,esp ; eax = original ebp + sub esp, byte 4 + and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits + mov [esp],eax + mov ebp,esp ; ebp = aligned ebp + lea esp, [wk(0)] + pushpic ebx +; push ecx ; unused +; push edx ; need not be preserved +; push esi ; unused +; push edi ; unused + + get_GOT ebx ; get GOT address + + ; ---- Pass 1: process rows. + + mov edx, POINTER [data(eax)] ; (DCTELEM *) + + movdqa xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_DCTELEM)] + movdqa xmm1, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_DCTELEM)] + movdqa xmm2, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_DCTELEM)] + movdqa xmm3, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_DCTELEM)] + + ; xmm0=(00 01 02 03 04 05 06 07), xmm2=(20 21 22 23 24 25 26 27) + ; xmm1=(10 11 12 13 14 15 16 17), xmm3=(30 31 32 33 34 35 36 37) + + movdqa xmm4,xmm0 ; transpose coefficients(phase 1) + punpcklwd xmm0,xmm1 ; xmm0=(00 10 01 11 02 12 03 13) + punpckhwd xmm4,xmm1 ; xmm4=(04 14 05 15 06 16 07 17) + movdqa xmm5,xmm2 ; transpose coefficients(phase 1) + punpcklwd xmm2,xmm3 ; xmm2=(20 30 21 31 22 32 23 33) + punpckhwd xmm5,xmm3 ; xmm5=(24 34 25 35 26 36 27 37) + + movdqa xmm6, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_DCTELEM)] + movdqa xmm7, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_DCTELEM)] + movdqa xmm1, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_DCTELEM)] + movdqa xmm3, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_DCTELEM)] + + ; xmm6=( 4 12 20 28 36 44 52 60), xmm1=( 6 14 22 30 38 46 54 62) + ; xmm7=( 5 13 21 29 37 45 53 61), xmm3=( 7 15 23 31 39 47 55 63) + + movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=(20 30 21 31 22 32 23 33) + movdqa XMMWORD [wk(1)], xmm5 ; wk(1)=(24 34 25 35 26 36 27 37) + + movdqa xmm2,xmm6 ; transpose coefficients(phase 1) + punpcklwd xmm6,xmm7 ; xmm6=(40 50 41 51 42 52 43 53) + punpckhwd xmm2,xmm7 ; xmm2=(44 54 45 55 46 56 47 57) + movdqa xmm5,xmm1 ; transpose coefficients(phase 1) + punpcklwd xmm1,xmm3 ; xmm1=(60 70 61 71 62 72 63 73) + punpckhwd xmm5,xmm3 ; xmm5=(64 74 65 75 66 76 67 77) + + movdqa xmm7,xmm6 ; transpose coefficients(phase 2) + punpckldq xmm6,xmm1 ; xmm6=(40 50 60 70 41 51 61 71) + punpckhdq xmm7,xmm1 ; xmm7=(42 52 62 72 43 53 63 73) + movdqa xmm3,xmm2 ; transpose coefficients(phase 2) + punpckldq xmm2,xmm5 ; xmm2=(44 54 64 74 45 55 65 75) + punpckhdq xmm3,xmm5 ; xmm3=(46 56 66 76 47 57 67 77) + + movdqa xmm1, XMMWORD [wk(0)] ; xmm1=(20 30 21 31 22 32 23 33) + movdqa xmm5, XMMWORD [wk(1)] ; xmm5=(24 34 25 35 26 36 27 37) + movdqa XMMWORD [wk(2)], xmm7 ; wk(2)=(42 52 62 72 43 53 63 73) + movdqa XMMWORD [wk(3)], xmm2 ; wk(3)=(44 54 64 74 45 55 65 75) + + movdqa xmm7,xmm0 ; transpose coefficients(phase 2) + punpckldq xmm0,xmm1 ; xmm0=(00 10 20 30 01 11 21 31) + punpckhdq xmm7,xmm1 ; xmm7=(02 12 22 32 03 13 23 33) + movdqa xmm2,xmm4 ; transpose coefficients(phase 2) + punpckldq xmm4,xmm5 ; xmm4=(04 14 24 34 05 15 25 35) + punpckhdq xmm2,xmm5 ; xmm2=(06 16 26 36 07 17 27 37) + + movdqa xmm1,xmm0 ; transpose coefficients(phase 3) + punpcklqdq xmm0,xmm6 ; xmm0=(00 10 20 30 40 50 60 70)=data0 + punpckhqdq xmm1,xmm6 ; xmm1=(01 11 21 31 41 51 61 71)=data1 + movdqa xmm5,xmm2 ; transpose coefficients(phase 3) + punpcklqdq xmm2,xmm3 ; xmm2=(06 16 26 36 46 56 66 76)=data6 + punpckhqdq xmm5,xmm3 ; xmm5=(07 17 27 37 47 57 67 77)=data7 + + movdqa xmm6,xmm1 + movdqa xmm3,xmm0 + psubw xmm1,xmm2 ; xmm1=data1-data6=tmp6 + psubw xmm0,xmm5 ; xmm0=data0-data7=tmp7 + paddw xmm6,xmm2 ; xmm6=data1+data6=tmp1 + paddw xmm3,xmm5 ; xmm3=data0+data7=tmp0 + + movdqa xmm2, XMMWORD [wk(2)] ; xmm2=(42 52 62 72 43 53 63 73) + movdqa xmm5, XMMWORD [wk(3)] ; xmm5=(44 54 64 74 45 55 65 75) + movdqa XMMWORD [wk(0)], xmm1 ; wk(0)=tmp6 + movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=tmp7 + + movdqa xmm1,xmm7 ; transpose coefficients(phase 3) + punpcklqdq xmm7,xmm2 ; xmm7=(02 12 22 32 42 52 62 72)=data2 + punpckhqdq xmm1,xmm2 ; xmm1=(03 13 23 33 43 53 63 73)=data3 + movdqa xmm0,xmm4 ; transpose coefficients(phase 3) + punpcklqdq xmm4,xmm5 ; xmm4=(04 14 24 34 44 54 64 74)=data4 + punpckhqdq xmm0,xmm5 ; xmm0=(05 15 25 35 45 55 65 75)=data5 + + movdqa xmm2,xmm1 + movdqa xmm5,xmm7 + paddw xmm1,xmm4 ; xmm1=data3+data4=tmp3 + paddw xmm7,xmm0 ; xmm7=data2+data5=tmp2 + psubw xmm2,xmm4 ; xmm2=data3-data4=tmp4 + psubw xmm5,xmm0 ; xmm5=data2-data5=tmp5 + + ; -- Even part + + movdqa xmm4,xmm3 + movdqa xmm0,xmm6 + paddw xmm3,xmm1 ; xmm3=tmp10 + paddw xmm6,xmm7 ; xmm6=tmp11 + psubw xmm4,xmm1 ; xmm4=tmp13 + psubw xmm0,xmm7 ; xmm0=tmp12 + + movdqa xmm1,xmm3 + paddw xmm3,xmm6 ; xmm3=tmp10+tmp11 + psubw xmm1,xmm6 ; xmm1=tmp10-tmp11 + + psllw xmm3,PASS1_BITS ; xmm3=data0 + psllw xmm1,PASS1_BITS ; xmm1=data4 + + movdqa XMMWORD [wk(2)], xmm3 ; wk(2)=data0 + movdqa XMMWORD [wk(3)], xmm1 ; wk(3)=data4 + + ; (Original) + ; z1 = (tmp12 + tmp13) * 0.541196100; + ; data2 = z1 + tmp13 * 0.765366865; + ; data6 = z1 + tmp12 * -1.847759065; + ; + ; (This implementation) + ; data2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100; + ; data6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065); + + movdqa xmm7,xmm4 ; xmm4=tmp13 + movdqa xmm6,xmm4 + punpcklwd xmm7,xmm0 ; xmm0=tmp12 + punpckhwd xmm6,xmm0 + movdqa xmm4,xmm7 + movdqa xmm0,xmm6 + pmaddwd xmm7,[GOTOFF(ebx,PW_F130_F054)] ; xmm7=data2L + pmaddwd xmm6,[GOTOFF(ebx,PW_F130_F054)] ; xmm6=data2H + pmaddwd xmm4,[GOTOFF(ebx,PW_F054_MF130)] ; xmm4=data6L + pmaddwd xmm0,[GOTOFF(ebx,PW_F054_MF130)] ; xmm0=data6H + + paddd xmm7,[GOTOFF(ebx,PD_DESCALE_P1)] + paddd xmm6,[GOTOFF(ebx,PD_DESCALE_P1)] + psrad xmm7,DESCALE_P1 + psrad xmm6,DESCALE_P1 + paddd xmm4,[GOTOFF(ebx,PD_DESCALE_P1)] + paddd xmm0,[GOTOFF(ebx,PD_DESCALE_P1)] + psrad xmm4,DESCALE_P1 + psrad xmm0,DESCALE_P1 + + packssdw xmm7,xmm6 ; xmm7=data2 + packssdw xmm4,xmm0 ; xmm4=data6 + + movdqa XMMWORD [wk(4)], xmm7 ; wk(4)=data2 + movdqa XMMWORD [wk(5)], xmm4 ; wk(5)=data6 + + ; -- Odd part + + movdqa xmm3, XMMWORD [wk(0)] ; xmm3=tmp6 + movdqa xmm1, XMMWORD [wk(1)] ; xmm1=tmp7 + + movdqa xmm6,xmm2 ; xmm2=tmp4 + movdqa xmm0,xmm5 ; xmm5=tmp5 + paddw xmm6,xmm3 ; xmm6=z3 + paddw xmm0,xmm1 ; xmm0=z4 + + ; (Original) + ; z5 = (z3 + z4) * 1.175875602; + ; z3 = z3 * -1.961570560; z4 = z4 * -0.390180644; + ; z3 += z5; z4 += z5; + ; + ; (This implementation) + ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602; + ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644); + + movdqa xmm7,xmm6 + movdqa xmm4,xmm6 + punpcklwd xmm7,xmm0 + punpckhwd xmm4,xmm0 + movdqa xmm6,xmm7 + movdqa xmm0,xmm4 + pmaddwd xmm7,[GOTOFF(ebx,PW_MF078_F117)] ; xmm7=z3L + pmaddwd xmm4,[GOTOFF(ebx,PW_MF078_F117)] ; xmm4=z3H + pmaddwd xmm6,[GOTOFF(ebx,PW_F117_F078)] ; xmm6=z4L + pmaddwd xmm0,[GOTOFF(ebx,PW_F117_F078)] ; xmm0=z4H + + movdqa XMMWORD [wk(0)], xmm7 ; wk(0)=z3L + movdqa XMMWORD [wk(1)], xmm4 ; wk(1)=z3H + + ; (Original) + ; z1 = tmp4 + tmp7; z2 = tmp5 + tmp6; + ; tmp4 = tmp4 * 0.298631336; tmp5 = tmp5 * 2.053119869; + ; tmp6 = tmp6 * 3.072711026; tmp7 = tmp7 * 1.501321110; + ; z1 = z1 * -0.899976223; z2 = z2 * -2.562915447; + ; data7 = tmp4 + z1 + z3; data5 = tmp5 + z2 + z4; + ; data3 = tmp6 + z2 + z3; data1 = tmp7 + z1 + z4; + ; + ; (This implementation) + ; tmp4 = tmp4 * (0.298631336 - 0.899976223) + tmp7 * -0.899976223; + ; tmp5 = tmp5 * (2.053119869 - 2.562915447) + tmp6 * -2.562915447; + ; tmp6 = tmp5 * -2.562915447 + tmp6 * (3.072711026 - 2.562915447); + ; tmp7 = tmp4 * -0.899976223 + tmp7 * (1.501321110 - 0.899976223); + ; data7 = tmp4 + z3; data5 = tmp5 + z4; + ; data3 = tmp6 + z3; data1 = tmp7 + z4; + + movdqa xmm7,xmm2 + movdqa xmm4,xmm2 + punpcklwd xmm7,xmm1 + punpckhwd xmm4,xmm1 + movdqa xmm2,xmm7 + movdqa xmm1,xmm4 + pmaddwd xmm7,[GOTOFF(ebx,PW_MF060_MF089)] ; xmm7=tmp4L + pmaddwd xmm4,[GOTOFF(ebx,PW_MF060_MF089)] ; xmm4=tmp4H + pmaddwd xmm2,[GOTOFF(ebx,PW_MF089_F060)] ; xmm2=tmp7L + pmaddwd xmm1,[GOTOFF(ebx,PW_MF089_F060)] ; xmm1=tmp7H + + paddd xmm7, XMMWORD [wk(0)] ; xmm7=data7L + paddd xmm4, XMMWORD [wk(1)] ; xmm4=data7H + paddd xmm2,xmm6 ; xmm2=data1L + paddd xmm1,xmm0 ; xmm1=data1H + + paddd xmm7,[GOTOFF(ebx,PD_DESCALE_P1)] + paddd xmm4,[GOTOFF(ebx,PD_DESCALE_P1)] + psrad xmm7,DESCALE_P1 + psrad xmm4,DESCALE_P1 + paddd xmm2,[GOTOFF(ebx,PD_DESCALE_P1)] + paddd xmm1,[GOTOFF(ebx,PD_DESCALE_P1)] + psrad xmm2,DESCALE_P1 + psrad xmm1,DESCALE_P1 + + packssdw xmm7,xmm4 ; xmm7=data7 + packssdw xmm2,xmm1 ; xmm2=data1 + + movdqa xmm4,xmm5 + movdqa xmm1,xmm5 + punpcklwd xmm4,xmm3 + punpckhwd xmm1,xmm3 + movdqa xmm5,xmm4 + movdqa xmm3,xmm1 + pmaddwd xmm4,[GOTOFF(ebx,PW_MF050_MF256)] ; xmm4=tmp5L + pmaddwd xmm1,[GOTOFF(ebx,PW_MF050_MF256)] ; xmm1=tmp5H + pmaddwd xmm5,[GOTOFF(ebx,PW_MF256_F050)] ; xmm5=tmp6L + pmaddwd xmm3,[GOTOFF(ebx,PW_MF256_F050)] ; xmm3=tmp6H + + paddd xmm4,xmm6 ; xmm4=data5L + paddd xmm1,xmm0 ; xmm1=data5H + paddd xmm5, XMMWORD [wk(0)] ; xmm5=data3L + paddd xmm3, XMMWORD [wk(1)] ; xmm3=data3H + + paddd xmm4,[GOTOFF(ebx,PD_DESCALE_P1)] + paddd xmm1,[GOTOFF(ebx,PD_DESCALE_P1)] + psrad xmm4,DESCALE_P1 + psrad xmm1,DESCALE_P1 + paddd xmm5,[GOTOFF(ebx,PD_DESCALE_P1)] + paddd xmm3,[GOTOFF(ebx,PD_DESCALE_P1)] + psrad xmm5,DESCALE_P1 + psrad xmm3,DESCALE_P1 + + packssdw xmm4,xmm1 ; xmm4=data5 + packssdw xmm5,xmm3 ; xmm5=data3 + + ; ---- Pass 2: process columns. + +; mov edx, POINTER [data(eax)] ; (DCTELEM *) + + movdqa xmm6, XMMWORD [wk(2)] ; xmm6=col0 + movdqa xmm0, XMMWORD [wk(4)] ; xmm0=col2 + + ; xmm6=(00 10 20 30 40 50 60 70), xmm0=(02 12 22 32 42 52 62 72) + ; xmm2=(01 11 21 31 41 51 61 71), xmm5=(03 13 23 33 43 53 63 73) + + movdqa xmm1,xmm6 ; transpose coefficients(phase 1) + punpcklwd xmm6,xmm2 ; xmm6=(00 01 10 11 20 21 30 31) + punpckhwd xmm1,xmm2 ; xmm1=(40 41 50 51 60 61 70 71) + movdqa xmm3,xmm0 ; transpose coefficients(phase 1) + punpcklwd xmm0,xmm5 ; xmm0=(02 03 12 13 22 23 32 33) + punpckhwd xmm3,xmm5 ; xmm3=(42 43 52 53 62 63 72 73) + + movdqa xmm2, XMMWORD [wk(3)] ; xmm2=col4 + movdqa xmm5, XMMWORD [wk(5)] ; xmm5=col6 + + ; xmm2=(04 14 24 34 44 54 64 74), xmm5=(06 16 26 36 46 56 66 76) + ; xmm4=(05 15 25 35 45 55 65 75), xmm7=(07 17 27 37 47 57 67 77) + + movdqa XMMWORD [wk(0)], xmm0 ; wk(0)=(02 03 12 13 22 23 32 33) + movdqa XMMWORD [wk(1)], xmm3 ; wk(1)=(42 43 52 53 62 63 72 73) + + movdqa xmm0,xmm2 ; transpose coefficients(phase 1) + punpcklwd xmm2,xmm4 ; xmm2=(04 05 14 15 24 25 34 35) + punpckhwd xmm0,xmm4 ; xmm0=(44 45 54 55 64 65 74 75) + movdqa xmm3,xmm5 ; transpose coefficients(phase 1) + punpcklwd xmm5,xmm7 ; xmm5=(06 07 16 17 26 27 36 37) + punpckhwd xmm3,xmm7 ; xmm3=(46 47 56 57 66 67 76 77) + + movdqa xmm4,xmm2 ; transpose coefficients(phase 2) + punpckldq xmm2,xmm5 ; xmm2=(04 05 06 07 14 15 16 17) + punpckhdq xmm4,xmm5 ; xmm4=(24 25 26 27 34 35 36 37) + movdqa xmm7,xmm0 ; transpose coefficients(phase 2) + punpckldq xmm0,xmm3 ; xmm0=(44 45 46 47 54 55 56 57) + punpckhdq xmm7,xmm3 ; xmm7=(64 65 66 67 74 75 76 77) + + movdqa xmm5, XMMWORD [wk(0)] ; xmm5=(02 03 12 13 22 23 32 33) + movdqa xmm3, XMMWORD [wk(1)] ; xmm3=(42 43 52 53 62 63 72 73) + movdqa XMMWORD [wk(2)], xmm4 ; wk(2)=(24 25 26 27 34 35 36 37) + movdqa XMMWORD [wk(3)], xmm0 ; wk(3)=(44 45 46 47 54 55 56 57) + + movdqa xmm4,xmm6 ; transpose coefficients(phase 2) + punpckldq xmm6,xmm5 ; xmm6=(00 01 02 03 10 11 12 13) + punpckhdq xmm4,xmm5 ; xmm4=(20 21 22 23 30 31 32 33) + movdqa xmm0,xmm1 ; transpose coefficients(phase 2) + punpckldq xmm1,xmm3 ; xmm1=(40 41 42 43 50 51 52 53) + punpckhdq xmm0,xmm3 ; xmm0=(60 61 62 63 70 71 72 73) + + movdqa xmm5,xmm6 ; transpose coefficients(phase 3) + punpcklqdq xmm6,xmm2 ; xmm6=(00 01 02 03 04 05 06 07)=data0 + punpckhqdq xmm5,xmm2 ; xmm5=(10 11 12 13 14 15 16 17)=data1 + movdqa xmm3,xmm0 ; transpose coefficients(phase 3) + punpcklqdq xmm0,xmm7 ; xmm0=(60 61 62 63 64 65 66 67)=data6 + punpckhqdq xmm3,xmm7 ; xmm3=(70 71 72 73 74 75 76 77)=data7 + + movdqa xmm2,xmm5 + movdqa xmm7,xmm6 + psubw xmm5,xmm0 ; xmm5=data1-data6=tmp6 + psubw xmm6,xmm3 ; xmm6=data0-data7=tmp7 + paddw xmm2,xmm0 ; xmm2=data1+data6=tmp1 + paddw xmm7,xmm3 ; xmm7=data0+data7=tmp0 + + movdqa xmm0, XMMWORD [wk(2)] ; xmm0=(24 25 26 27 34 35 36 37) + movdqa xmm3, XMMWORD [wk(3)] ; xmm3=(44 45 46 47 54 55 56 57) + movdqa XMMWORD [wk(0)], xmm5 ; wk(0)=tmp6 + movdqa XMMWORD [wk(1)], xmm6 ; wk(1)=tmp7 + + movdqa xmm5,xmm4 ; transpose coefficients(phase 3) + punpcklqdq xmm4,xmm0 ; xmm4=(20 21 22 23 24 25 26 27)=data2 + punpckhqdq xmm5,xmm0 ; xmm5=(30 31 32 33 34 35 36 37)=data3 + movdqa xmm6,xmm1 ; transpose coefficients(phase 3) + punpcklqdq xmm1,xmm3 ; xmm1=(40 41 42 43 44 45 46 47)=data4 + punpckhqdq xmm6,xmm3 ; xmm6=(50 51 52 53 54 55 56 57)=data5 + + movdqa xmm0,xmm5 + movdqa xmm3,xmm4 + paddw xmm5,xmm1 ; xmm5=data3+data4=tmp3 + paddw xmm4,xmm6 ; xmm4=data2+data5=tmp2 + psubw xmm0,xmm1 ; xmm0=data3-data4=tmp4 + psubw xmm3,xmm6 ; xmm3=data2-data5=tmp5 + + ; -- Even part + + movdqa xmm1,xmm7 + movdqa xmm6,xmm2 + paddw xmm7,xmm5 ; xmm7=tmp10 + paddw xmm2,xmm4 ; xmm2=tmp11 + psubw xmm1,xmm5 ; xmm1=tmp13 + psubw xmm6,xmm4 ; xmm6=tmp12 + + movdqa xmm5,xmm7 + paddw xmm7,xmm2 ; xmm7=tmp10+tmp11 + psubw xmm5,xmm2 ; xmm5=tmp10-tmp11 + + paddw xmm7,[GOTOFF(ebx,PW_DESCALE_P2X)] + paddw xmm5,[GOTOFF(ebx,PW_DESCALE_P2X)] + psraw xmm7,PASS1_BITS ; xmm7=data0 + psraw xmm5,PASS1_BITS ; xmm5=data4 + + movdqa XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_DCTELEM)], xmm7 + movdqa XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_DCTELEM)], xmm5 + + ; (Original) + ; z1 = (tmp12 + tmp13) * 0.541196100; + ; data2 = z1 + tmp13 * 0.765366865; + ; data6 = z1 + tmp12 * -1.847759065; + ; + ; (This implementation) + ; data2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100; + ; data6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065); + + movdqa xmm4,xmm1 ; xmm1=tmp13 + movdqa xmm2,xmm1 + punpcklwd xmm4,xmm6 ; xmm6=tmp12 + punpckhwd xmm2,xmm6 + movdqa xmm1,xmm4 + movdqa xmm6,xmm2 + pmaddwd xmm4,[GOTOFF(ebx,PW_F130_F054)] ; xmm4=data2L + pmaddwd xmm2,[GOTOFF(ebx,PW_F130_F054)] ; xmm2=data2H + pmaddwd xmm1,[GOTOFF(ebx,PW_F054_MF130)] ; xmm1=data6L + pmaddwd xmm6,[GOTOFF(ebx,PW_F054_MF130)] ; xmm6=data6H + + paddd xmm4,[GOTOFF(ebx,PD_DESCALE_P2)] + paddd xmm2,[GOTOFF(ebx,PD_DESCALE_P2)] + psrad xmm4,DESCALE_P2 + psrad xmm2,DESCALE_P2 + paddd xmm1,[GOTOFF(ebx,PD_DESCALE_P2)] + paddd xmm6,[GOTOFF(ebx,PD_DESCALE_P2)] + psrad xmm1,DESCALE_P2 + psrad xmm6,DESCALE_P2 + + packssdw xmm4,xmm2 ; xmm4=data2 + packssdw xmm1,xmm6 ; xmm1=data6 + + movdqa XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_DCTELEM)], xmm4 + movdqa XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_DCTELEM)], xmm1 + + ; -- Odd part + + movdqa xmm7, XMMWORD [wk(0)] ; xmm7=tmp6 + movdqa xmm5, XMMWORD [wk(1)] ; xmm5=tmp7 + + movdqa xmm2,xmm0 ; xmm0=tmp4 + movdqa xmm6,xmm3 ; xmm3=tmp5 + paddw xmm2,xmm7 ; xmm2=z3 + paddw xmm6,xmm5 ; xmm6=z4 + + ; (Original) + ; z5 = (z3 + z4) * 1.175875602; + ; z3 = z3 * -1.961570560; z4 = z4 * -0.390180644; + ; z3 += z5; z4 += z5; + ; + ; (This implementation) + ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602; + ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644); + + movdqa xmm4,xmm2 + movdqa xmm1,xmm2 + punpcklwd xmm4,xmm6 + punpckhwd xmm1,xmm6 + movdqa xmm2,xmm4 + movdqa xmm6,xmm1 + pmaddwd xmm4,[GOTOFF(ebx,PW_MF078_F117)] ; xmm4=z3L + pmaddwd xmm1,[GOTOFF(ebx,PW_MF078_F117)] ; xmm1=z3H + pmaddwd xmm2,[GOTOFF(ebx,PW_F117_F078)] ; xmm2=z4L + pmaddwd xmm6,[GOTOFF(ebx,PW_F117_F078)] ; xmm6=z4H + + movdqa XMMWORD [wk(0)], xmm4 ; wk(0)=z3L + movdqa XMMWORD [wk(1)], xmm1 ; wk(1)=z3H + + ; (Original) + ; z1 = tmp4 + tmp7; z2 = tmp5 + tmp6; + ; tmp4 = tmp4 * 0.298631336; tmp5 = tmp5 * 2.053119869; + ; tmp6 = tmp6 * 3.072711026; tmp7 = tmp7 * 1.501321110; + ; z1 = z1 * -0.899976223; z2 = z2 * -2.562915447; + ; data7 = tmp4 + z1 + z3; data5 = tmp5 + z2 + z4; + ; data3 = tmp6 + z2 + z3; data1 = tmp7 + z1 + z4; + ; + ; (This implementation) + ; tmp4 = tmp4 * (0.298631336 - 0.899976223) + tmp7 * -0.899976223; + ; tmp5 = tmp5 * (2.053119869 - 2.562915447) + tmp6 * -2.562915447; + ; tmp6 = tmp5 * -2.562915447 + tmp6 * (3.072711026 - 2.562915447); + ; tmp7 = tmp4 * -0.899976223 + tmp7 * (1.501321110 - 0.899976223); + ; data7 = tmp4 + z3; data5 = tmp5 + z4; + ; data3 = tmp6 + z3; data1 = tmp7 + z4; + + movdqa xmm4,xmm0 + movdqa xmm1,xmm0 + punpcklwd xmm4,xmm5 + punpckhwd xmm1,xmm5 + movdqa xmm0,xmm4 + movdqa xmm5,xmm1 + pmaddwd xmm4,[GOTOFF(ebx,PW_MF060_MF089)] ; xmm4=tmp4L + pmaddwd xmm1,[GOTOFF(ebx,PW_MF060_MF089)] ; xmm1=tmp4H + pmaddwd xmm0,[GOTOFF(ebx,PW_MF089_F060)] ; xmm0=tmp7L + pmaddwd xmm5,[GOTOFF(ebx,PW_MF089_F060)] ; xmm5=tmp7H + + paddd xmm4, XMMWORD [wk(0)] ; xmm4=data7L + paddd xmm1, XMMWORD [wk(1)] ; xmm1=data7H + paddd xmm0,xmm2 ; xmm0=data1L + paddd xmm5,xmm6 ; xmm5=data1H + + paddd xmm4,[GOTOFF(ebx,PD_DESCALE_P2)] + paddd xmm1,[GOTOFF(ebx,PD_DESCALE_P2)] + psrad xmm4,DESCALE_P2 + psrad xmm1,DESCALE_P2 + paddd xmm0,[GOTOFF(ebx,PD_DESCALE_P2)] + paddd xmm5,[GOTOFF(ebx,PD_DESCALE_P2)] + psrad xmm0,DESCALE_P2 + psrad xmm5,DESCALE_P2 + + packssdw xmm4,xmm1 ; xmm4=data7 + packssdw xmm0,xmm5 ; xmm0=data1 + + movdqa XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_DCTELEM)], xmm4 + movdqa XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_DCTELEM)], xmm0 + + movdqa xmm1,xmm3 + movdqa xmm5,xmm3 + punpcklwd xmm1,xmm7 + punpckhwd xmm5,xmm7 + movdqa xmm3,xmm1 + movdqa xmm7,xmm5 + pmaddwd xmm1,[GOTOFF(ebx,PW_MF050_MF256)] ; xmm1=tmp5L + pmaddwd xmm5,[GOTOFF(ebx,PW_MF050_MF256)] ; xmm5=tmp5H + pmaddwd xmm3,[GOTOFF(ebx,PW_MF256_F050)] ; xmm3=tmp6L + pmaddwd xmm7,[GOTOFF(ebx,PW_MF256_F050)] ; xmm7=tmp6H + + paddd xmm1,xmm2 ; xmm1=data5L + paddd xmm5,xmm6 ; xmm5=data5H + paddd xmm3, XMMWORD [wk(0)] ; xmm3=data3L + paddd xmm7, XMMWORD [wk(1)] ; xmm7=data3H + + paddd xmm1,[GOTOFF(ebx,PD_DESCALE_P2)] + paddd xmm5,[GOTOFF(ebx,PD_DESCALE_P2)] + psrad xmm1,DESCALE_P2 + psrad xmm5,DESCALE_P2 + paddd xmm3,[GOTOFF(ebx,PD_DESCALE_P2)] + paddd xmm7,[GOTOFF(ebx,PD_DESCALE_P2)] + psrad xmm3,DESCALE_P2 + psrad xmm7,DESCALE_P2 + + packssdw xmm1,xmm5 ; xmm1=data5 + packssdw xmm3,xmm7 ; xmm3=data3 + + movdqa XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_DCTELEM)], xmm1 + movdqa XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_DCTELEM)], xmm3 + +; pop edi ; unused +; pop esi ; unused +; pop edx ; need not be preserved +; pop ecx ; unused + poppic ebx + mov esp,ebp ; esp <- aligned ebp + pop esp ; esp <- original ebp + pop ebp + ret + +; For some reason, the OS X linker does not honor the request to align the +; segment unless we do this. + align 16 diff --git a/media/libjpeg/simd/jidctflt-3dn.asm b/media/libjpeg/simd/jidctflt-3dn.asm new file mode 100644 index 000000000..99356f20a --- /dev/null +++ b/media/libjpeg/simd/jidctflt-3dn.asm @@ -0,0 +1,451 @@ +; +; jidctflt.asm - floating-point IDCT (3DNow! & MMX) +; +; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB +; +; Based on the x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler). +; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 +; +; This file contains a floating-point implementation of the inverse DCT +; (Discrete Cosine Transform). The following code is based directly on +; the IJG's original jidctflt.c; see the jidctflt.c for more details. +; +; [TAB8] + +%include "jsimdext.inc" +%include "jdct.inc" + +; -------------------------------------------------------------------------- + SECTION SEG_CONST + + alignz 16 + global EXTN(jconst_idct_float_3dnow) + +EXTN(jconst_idct_float_3dnow): + +PD_1_414 times 2 dd 1.414213562373095048801689 +PD_1_847 times 2 dd 1.847759065022573512256366 +PD_1_082 times 2 dd 1.082392200292393968799446 +PD_2_613 times 2 dd 2.613125929752753055713286 +PD_RNDINT_MAGIC times 2 dd 100663296.0 ; (float)(0x00C00000 << 3) +PB_CENTERJSAMP times 8 db CENTERJSAMPLE + + alignz 16 + +; -------------------------------------------------------------------------- + SECTION SEG_TEXT + BITS 32 +; +; Perform dequantization and inverse DCT on one block of coefficients. +; +; GLOBAL(void) +; jsimd_idct_float_3dnow (void *dct_table, JCOEFPTR coef_block, +; JSAMPARRAY output_buf, JDIMENSION output_col) +; + +%define dct_table(b) (b)+8 ; void *dct_table +%define coef_block(b) (b)+12 ; JCOEFPTR coef_block +%define output_buf(b) (b)+16 ; JSAMPARRAY output_buf +%define output_col(b) (b)+20 ; JDIMENSION output_col + +%define original_ebp ebp+0 +%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_MMWORD ; mmword wk[WK_NUM] +%define WK_NUM 2 +%define workspace wk(0)-DCTSIZE2*SIZEOF_FAST_FLOAT + ; FAST_FLOAT workspace[DCTSIZE2] + + align 16 + global EXTN(jsimd_idct_float_3dnow) + +EXTN(jsimd_idct_float_3dnow): + push ebp + mov eax,esp ; eax = original ebp + sub esp, byte 4 + and esp, byte (-SIZEOF_MMWORD) ; align to 64 bits + mov [esp],eax + mov ebp,esp ; ebp = aligned ebp + lea esp, [workspace] + push ebx +; push ecx ; need not be preserved +; push edx ; need not be preserved + push esi + push edi + + get_GOT ebx ; get GOT address + + ; ---- Pass 1: process columns from input, store into work array. + +; mov eax, [original_ebp] + mov edx, POINTER [dct_table(eax)] ; quantptr + mov esi, JCOEFPTR [coef_block(eax)] ; inptr + lea edi, [workspace] ; FAST_FLOAT *wsptr + mov ecx, DCTSIZE/2 ; ctr + alignx 16,7 +.columnloop: +%ifndef NO_ZERO_COLUMN_TEST_FLOAT_3DNOW + mov eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)] + or eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)] + jnz short .columnDCT + + pushpic ebx ; save GOT address + mov ebx, DWORD [DWBLOCK(3,0,esi,SIZEOF_JCOEF)] + mov eax, DWORD [DWBLOCK(4,0,esi,SIZEOF_JCOEF)] + or ebx, DWORD [DWBLOCK(5,0,esi,SIZEOF_JCOEF)] + or eax, DWORD [DWBLOCK(6,0,esi,SIZEOF_JCOEF)] + or ebx, DWORD [DWBLOCK(7,0,esi,SIZEOF_JCOEF)] + or eax,ebx + poppic ebx ; restore GOT address + jnz short .columnDCT + + ; -- AC terms all zero + + movd mm0, DWORD [DWBLOCK(0,0,esi,SIZEOF_JCOEF)] + + punpcklwd mm0,mm0 + psrad mm0,(DWORD_BIT-WORD_BIT) + pi2fd mm0,mm0 + + pfmul mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_FLOAT_MULT_TYPE)] + + movq mm1,mm0 + punpckldq mm0,mm0 + punpckhdq mm1,mm1 + + movq MMWORD [MMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], mm0 + movq MMWORD [MMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], mm0 + movq MMWORD [MMBLOCK(0,2,edi,SIZEOF_FAST_FLOAT)], mm0 + movq MMWORD [MMBLOCK(0,3,edi,SIZEOF_FAST_FLOAT)], mm0 + movq MMWORD [MMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], mm1 + movq MMWORD [MMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], mm1 + movq MMWORD [MMBLOCK(1,2,edi,SIZEOF_FAST_FLOAT)], mm1 + movq MMWORD [MMBLOCK(1,3,edi,SIZEOF_FAST_FLOAT)], mm1 + jmp near .nextcolumn + alignx 16,7 +%endif +.columnDCT: + + ; -- Even part + + movd mm0, DWORD [DWBLOCK(0,0,esi,SIZEOF_JCOEF)] + movd mm1, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)] + movd mm2, DWORD [DWBLOCK(4,0,esi,SIZEOF_JCOEF)] + movd mm3, DWORD [DWBLOCK(6,0,esi,SIZEOF_JCOEF)] + + punpcklwd mm0,mm0 + punpcklwd mm1,mm1 + psrad mm0,(DWORD_BIT-WORD_BIT) + psrad mm1,(DWORD_BIT-WORD_BIT) + pi2fd mm0,mm0 + pi2fd mm1,mm1 + + pfmul mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_FLOAT_MULT_TYPE)] + pfmul mm1, MMWORD [MMBLOCK(2,0,edx,SIZEOF_FLOAT_MULT_TYPE)] + + punpcklwd mm2,mm2 + punpcklwd mm3,mm3 + psrad mm2,(DWORD_BIT-WORD_BIT) + psrad mm3,(DWORD_BIT-WORD_BIT) + pi2fd mm2,mm2 + pi2fd mm3,mm3 + + pfmul mm2, MMWORD [MMBLOCK(4,0,edx,SIZEOF_FLOAT_MULT_TYPE)] + pfmul mm3, MMWORD [MMBLOCK(6,0,edx,SIZEOF_FLOAT_MULT_TYPE)] + + movq mm4,mm0 + movq mm5,mm1 + pfsub mm0,mm2 ; mm0=tmp11 + pfsub mm1,mm3 + pfadd mm4,mm2 ; mm4=tmp10 + pfadd mm5,mm3 ; mm5=tmp13 + + pfmul mm1,[GOTOFF(ebx,PD_1_414)] + pfsub mm1,mm5 ; mm1=tmp12 + + movq mm6,mm4 + movq mm7,mm0 + pfsub mm4,mm5 ; mm4=tmp3 + pfsub mm0,mm1 ; mm0=tmp2 + pfadd mm6,mm5 ; mm6=tmp0 + pfadd mm7,mm1 ; mm7=tmp1 + + movq MMWORD [wk(1)], mm4 ; tmp3 + movq MMWORD [wk(0)], mm0 ; tmp2 + + ; -- Odd part + + movd mm2, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)] + movd mm3, DWORD [DWBLOCK(3,0,esi,SIZEOF_JCOEF)] + movd mm5, DWORD [DWBLOCK(5,0,esi,SIZEOF_JCOEF)] + movd mm1, DWORD [DWBLOCK(7,0,esi,SIZEOF_JCOEF)] + + punpcklwd mm2,mm2 + punpcklwd mm3,mm3 + psrad mm2,(DWORD_BIT-WORD_BIT) + psrad mm3,(DWORD_BIT-WORD_BIT) + pi2fd mm2,mm2 + pi2fd mm3,mm3 + + pfmul mm2, MMWORD [MMBLOCK(1,0,edx,SIZEOF_FLOAT_MULT_TYPE)] + pfmul mm3, MMWORD [MMBLOCK(3,0,edx,SIZEOF_FLOAT_MULT_TYPE)] + + punpcklwd mm5,mm5 + punpcklwd mm1,mm1 + psrad mm5,(DWORD_BIT-WORD_BIT) + psrad mm1,(DWORD_BIT-WORD_BIT) + pi2fd mm5,mm5 + pi2fd mm1,mm1 + + pfmul mm5, MMWORD [MMBLOCK(5,0,edx,SIZEOF_FLOAT_MULT_TYPE)] + pfmul mm1, MMWORD [MMBLOCK(7,0,edx,SIZEOF_FLOAT_MULT_TYPE)] + + movq mm4,mm2 + movq mm0,mm5 + pfadd mm2,mm1 ; mm2=z11 + pfadd mm5,mm3 ; mm5=z13 + pfsub mm4,mm1 ; mm4=z12 + pfsub mm0,mm3 ; mm0=z10 + + movq mm1,mm2 + pfsub mm2,mm5 + pfadd mm1,mm5 ; mm1=tmp7 + + pfmul mm2,[GOTOFF(ebx,PD_1_414)] ; mm2=tmp11 + + movq mm3,mm0 + pfadd mm0,mm4 + pfmul mm0,[GOTOFF(ebx,PD_1_847)] ; mm0=z5 + pfmul mm3,[GOTOFF(ebx,PD_2_613)] ; mm3=(z10 * 2.613125930) + pfmul mm4,[GOTOFF(ebx,PD_1_082)] ; mm4=(z12 * 1.082392200) + pfsubr mm3,mm0 ; mm3=tmp12 + pfsub mm4,mm0 ; mm4=tmp10 + + ; -- Final output stage + + pfsub mm3,mm1 ; mm3=tmp6 + movq mm5,mm6 + movq mm0,mm7 + pfadd mm6,mm1 ; mm6=data0=(00 01) + pfadd mm7,mm3 ; mm7=data1=(10 11) + pfsub mm5,mm1 ; mm5=data7=(70 71) + pfsub mm0,mm3 ; mm0=data6=(60 61) + pfsub mm2,mm3 ; mm2=tmp5 + + movq mm1,mm6 ; transpose coefficients + punpckldq mm6,mm7 ; mm6=(00 10) + punpckhdq mm1,mm7 ; mm1=(01 11) + movq mm3,mm0 ; transpose coefficients + punpckldq mm0,mm5 ; mm0=(60 70) + punpckhdq mm3,mm5 ; mm3=(61 71) + + movq MMWORD [MMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], mm6 + movq MMWORD [MMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], mm1 + movq MMWORD [MMBLOCK(0,3,edi,SIZEOF_FAST_FLOAT)], mm0 + movq MMWORD [MMBLOCK(1,3,edi,SIZEOF_FAST_FLOAT)], mm3 + + movq mm7, MMWORD [wk(0)] ; mm7=tmp2 + movq mm5, MMWORD [wk(1)] ; mm5=tmp3 + + pfadd mm4,mm2 ; mm4=tmp4 + movq mm6,mm7 + movq mm1,mm5 + pfadd mm7,mm2 ; mm7=data2=(20 21) + pfadd mm5,mm4 ; mm5=data4=(40 41) + pfsub mm6,mm2 ; mm6=data5=(50 51) + pfsub mm1,mm4 ; mm1=data3=(30 31) + + movq mm0,mm7 ; transpose coefficients + punpckldq mm7,mm1 ; mm7=(20 30) + punpckhdq mm0,mm1 ; mm0=(21 31) + movq mm3,mm5 ; transpose coefficients + punpckldq mm5,mm6 ; mm5=(40 50) + punpckhdq mm3,mm6 ; mm3=(41 51) + + movq MMWORD [MMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], mm7 + movq MMWORD [MMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], mm0 + movq MMWORD [MMBLOCK(0,2,edi,SIZEOF_FAST_FLOAT)], mm5 + movq MMWORD [MMBLOCK(1,2,edi,SIZEOF_FAST_FLOAT)], mm3 + +.nextcolumn: + add esi, byte 2*SIZEOF_JCOEF ; coef_block + add edx, byte 2*SIZEOF_FLOAT_MULT_TYPE ; quantptr + add edi, byte 2*DCTSIZE*SIZEOF_FAST_FLOAT ; wsptr + dec ecx ; ctr + jnz near .columnloop + + ; -- Prefetch the next coefficient block + + prefetch [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 0*32] + prefetch [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 1*32] + prefetch [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 2*32] + prefetch [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 3*32] + + ; ---- Pass 2: process rows from work array, store into output array. + + mov eax, [original_ebp] + lea esi, [workspace] ; FAST_FLOAT *wsptr + mov edi, JSAMPARRAY [output_buf(eax)] ; (JSAMPROW *) + mov eax, JDIMENSION [output_col(eax)] + mov ecx, DCTSIZE/2 ; ctr + alignx 16,7 +.rowloop: + + ; -- Even part + + movq mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_FAST_FLOAT)] + movq mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_FAST_FLOAT)] + movq mm2, MMWORD [MMBLOCK(4,0,esi,SIZEOF_FAST_FLOAT)] + movq mm3, MMWORD [MMBLOCK(6,0,esi,SIZEOF_FAST_FLOAT)] + + movq mm4,mm0 + movq mm5,mm1 + pfsub mm0,mm2 ; mm0=tmp11 + pfsub mm1,mm3 + pfadd mm4,mm2 ; mm4=tmp10 + pfadd mm5,mm3 ; mm5=tmp13 + + pfmul mm1,[GOTOFF(ebx,PD_1_414)] + pfsub mm1,mm5 ; mm1=tmp12 + + movq mm6,mm4 + movq mm7,mm0 + pfsub mm4,mm5 ; mm4=tmp3 + pfsub mm0,mm1 ; mm0=tmp2 + pfadd mm6,mm5 ; mm6=tmp0 + pfadd mm7,mm1 ; mm7=tmp1 + + movq MMWORD [wk(1)], mm4 ; tmp3 + movq MMWORD [wk(0)], mm0 ; tmp2 + + ; -- Odd part + + movq mm2, MMWORD [MMBLOCK(1,0,esi,SIZEOF_FAST_FLOAT)] + movq mm3, MMWORD [MMBLOCK(3,0,esi,SIZEOF_FAST_FLOAT)] + movq mm5, MMWORD [MMBLOCK(5,0,esi,SIZEOF_FAST_FLOAT)] + movq mm1, MMWORD [MMBLOCK(7,0,esi,SIZEOF_FAST_FLOAT)] + + movq mm4,mm2 + movq mm0,mm5 + pfadd mm2,mm1 ; mm2=z11 + pfadd mm5,mm3 ; mm5=z13 + pfsub mm4,mm1 ; mm4=z12 + pfsub mm0,mm3 ; mm0=z10 + + movq mm1,mm2 + pfsub mm2,mm5 + pfadd mm1,mm5 ; mm1=tmp7 + + pfmul mm2,[GOTOFF(ebx,PD_1_414)] ; mm2=tmp11 + + movq mm3,mm0 + pfadd mm0,mm4 + pfmul mm0,[GOTOFF(ebx,PD_1_847)] ; mm0=z5 + pfmul mm3,[GOTOFF(ebx,PD_2_613)] ; mm3=(z10 * 2.613125930) + pfmul mm4,[GOTOFF(ebx,PD_1_082)] ; mm4=(z12 * 1.082392200) + pfsubr mm3,mm0 ; mm3=tmp12 + pfsub mm4,mm0 ; mm4=tmp10 + + ; -- Final output stage + + pfsub mm3,mm1 ; mm3=tmp6 + movq mm5,mm6 + movq mm0,mm7 + pfadd mm6,mm1 ; mm6=data0=(00 10) + pfadd mm7,mm3 ; mm7=data1=(01 11) + pfsub mm5,mm1 ; mm5=data7=(07 17) + pfsub mm0,mm3 ; mm0=data6=(06 16) + pfsub mm2,mm3 ; mm2=tmp5 + + movq mm1,[GOTOFF(ebx,PD_RNDINT_MAGIC)] ; mm1=[PD_RNDINT_MAGIC] + pcmpeqd mm3,mm3 + psrld mm3,WORD_BIT ; mm3={0xFFFF 0x0000 0xFFFF 0x0000} + + pfadd mm6,mm1 ; mm6=roundint(data0/8)=(00 ** 10 **) + pfadd mm7,mm1 ; mm7=roundint(data1/8)=(01 ** 11 **) + pfadd mm0,mm1 ; mm0=roundint(data6/8)=(06 ** 16 **) + pfadd mm5,mm1 ; mm5=roundint(data7/8)=(07 ** 17 **) + + pand mm6,mm3 ; mm6=(00 -- 10 --) + pslld mm7,WORD_BIT ; mm7=(-- 01 -- 11) + pand mm0,mm3 ; mm0=(06 -- 16 --) + pslld mm5,WORD_BIT ; mm5=(-- 07 -- 17) + por mm6,mm7 ; mm6=(00 01 10 11) + por mm0,mm5 ; mm0=(06 07 16 17) + + movq mm1, MMWORD [wk(0)] ; mm1=tmp2 + movq mm3, MMWORD [wk(1)] ; mm3=tmp3 + + pfadd mm4,mm2 ; mm4=tmp4 + movq mm7,mm1 + movq mm5,mm3 + pfadd mm1,mm2 ; mm1=data2=(02 12) + pfadd mm3,mm4 ; mm3=data4=(04 14) + pfsub mm7,mm2 ; mm7=data5=(05 15) + pfsub mm5,mm4 ; mm5=data3=(03 13) + + movq mm2,[GOTOFF(ebx,PD_RNDINT_MAGIC)] ; mm2=[PD_RNDINT_MAGIC] + pcmpeqd mm4,mm4 + psrld mm4,WORD_BIT ; mm4={0xFFFF 0x0000 0xFFFF 0x0000} + + pfadd mm3,mm2 ; mm3=roundint(data4/8)=(04 ** 14 **) + pfadd mm7,mm2 ; mm7=roundint(data5/8)=(05 ** 15 **) + pfadd mm1,mm2 ; mm1=roundint(data2/8)=(02 ** 12 **) + pfadd mm5,mm2 ; mm5=roundint(data3/8)=(03 ** 13 **) + + pand mm3,mm4 ; mm3=(04 -- 14 --) + pslld mm7,WORD_BIT ; mm7=(-- 05 -- 15) + pand mm1,mm4 ; mm1=(02 -- 12 --) + pslld mm5,WORD_BIT ; mm5=(-- 03 -- 13) + por mm3,mm7 ; mm3=(04 05 14 15) + por mm1,mm5 ; mm1=(02 03 12 13) + + movq mm2,[GOTOFF(ebx,PB_CENTERJSAMP)] ; mm2=[PB_CENTERJSAMP] + + packsswb mm6,mm3 ; mm6=(00 01 10 11 04 05 14 15) + packsswb mm1,mm0 ; mm1=(02 03 12 13 06 07 16 17) + paddb mm6,mm2 + paddb mm1,mm2 + + movq mm4,mm6 ; transpose coefficients(phase 2) + punpcklwd mm6,mm1 ; mm6=(00 01 02 03 10 11 12 13) + punpckhwd mm4,mm1 ; mm4=(04 05 06 07 14 15 16 17) + + movq mm7,mm6 ; transpose coefficients(phase 3) + punpckldq mm6,mm4 ; mm6=(00 01 02 03 04 05 06 07) + punpckhdq mm7,mm4 ; mm7=(10 11 12 13 14 15 16 17) + + pushpic ebx ; save GOT address + + mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW] + mov ebx, JSAMPROW [edi+1*SIZEOF_JSAMPROW] + movq MMWORD [edx+eax*SIZEOF_JSAMPLE], mm6 + movq MMWORD [ebx+eax*SIZEOF_JSAMPLE], mm7 + + poppic ebx ; restore GOT address + + add esi, byte 2*SIZEOF_FAST_FLOAT ; wsptr + add edi, byte 2*SIZEOF_JSAMPROW + dec ecx ; ctr + jnz near .rowloop + + femms ; empty MMX/3DNow! state + + pop edi + pop esi +; pop edx ; need not be preserved +; pop ecx ; need not be preserved + pop ebx + mov esp,ebp ; esp <- aligned ebp + pop esp ; esp <- original ebp + pop ebp + ret + +; For some reason, the OS X linker does not honor the request to align the +; segment unless we do this. + align 16 diff --git a/media/libjpeg/simd/jidctflt-sse.asm b/media/libjpeg/simd/jidctflt-sse.asm new file mode 100644 index 000000000..4d4af2fff --- /dev/null +++ b/media/libjpeg/simd/jidctflt-sse.asm @@ -0,0 +1,571 @@ +; +; jidctflt.asm - floating-point IDCT (SSE & MMX) +; +; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB +; +; Based on the x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler). +; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 +; +; This file contains a floating-point implementation of the inverse DCT +; (Discrete Cosine Transform). The following code is based directly on +; the IJG's original jidctflt.c; see the jidctflt.c for more details. +; +; [TAB8] + +%include "jsimdext.inc" +%include "jdct.inc" + +; -------------------------------------------------------------------------- + +%macro unpcklps2 2 ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(0 1 4 5) + shufps %1,%2,0x44 +%endmacro + +%macro unpckhps2 2 ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(2 3 6 7) + shufps %1,%2,0xEE +%endmacro + +; -------------------------------------------------------------------------- + SECTION SEG_CONST + + alignz 16 + global EXTN(jconst_idct_float_sse) + +EXTN(jconst_idct_float_sse): + +PD_1_414 times 4 dd 1.414213562373095048801689 +PD_1_847 times 4 dd 1.847759065022573512256366 +PD_1_082 times 4 dd 1.082392200292393968799446 +PD_M2_613 times 4 dd -2.613125929752753055713286 +PD_0_125 times 4 dd 0.125 ; 1/8 +PB_CENTERJSAMP times 8 db CENTERJSAMPLE + + alignz 16 + +; -------------------------------------------------------------------------- + SECTION SEG_TEXT + BITS 32 +; +; Perform dequantization and inverse DCT on one block of coefficients. +; +; GLOBAL(void) +; jsimd_idct_float_sse (void *dct_table, JCOEFPTR coef_block, +; JSAMPARRAY output_buf, JDIMENSION output_col) +; + +%define dct_table(b) (b)+8 ; void *dct_table +%define coef_block(b) (b)+12 ; JCOEFPTR coef_block +%define output_buf(b) (b)+16 ; JSAMPARRAY output_buf +%define output_col(b) (b)+20 ; JDIMENSION output_col + +%define original_ebp ebp+0 +%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM] +%define WK_NUM 2 +%define workspace wk(0)-DCTSIZE2*SIZEOF_FAST_FLOAT + ; FAST_FLOAT workspace[DCTSIZE2] + + align 16 + global EXTN(jsimd_idct_float_sse) + +EXTN(jsimd_idct_float_sse): + push ebp + mov eax,esp ; eax = original ebp + sub esp, byte 4 + and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits + mov [esp],eax + mov ebp,esp ; ebp = aligned ebp + lea esp, [workspace] + push ebx +; push ecx ; need not be preserved +; push edx ; need not be preserved + push esi + push edi + + get_GOT ebx ; get GOT address + + ; ---- Pass 1: process columns from input, store into work array. + +; mov eax, [original_ebp] + mov edx, POINTER [dct_table(eax)] ; quantptr + mov esi, JCOEFPTR [coef_block(eax)] ; inptr + lea edi, [workspace] ; FAST_FLOAT *wsptr + mov ecx, DCTSIZE/4 ; ctr + alignx 16,7 +.columnloop: +%ifndef NO_ZERO_COLUMN_TEST_FLOAT_SSE + mov eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)] + or eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)] + jnz near .columnDCT + + movq mm0, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)] + movq mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)] + por mm0, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)] + por mm1, MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)] + por mm0, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)] + por mm1, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)] + por mm0, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)] + por mm1,mm0 + packsswb mm1,mm1 + movd eax,mm1 + test eax,eax + jnz short .columnDCT + + ; -- AC terms all zero + + movq mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)] + + punpckhwd mm1,mm0 ; mm1=(** 02 ** 03) + punpcklwd mm0,mm0 ; mm0=(00 00 01 01) + psrad mm1,(DWORD_BIT-WORD_BIT) ; mm1=in0H=(02 03) + psrad mm0,(DWORD_BIT-WORD_BIT) ; mm0=in0L=(00 01) + cvtpi2ps xmm3,mm1 ; xmm3=(02 03 ** **) + cvtpi2ps xmm0,mm0 ; xmm0=(00 01 ** **) + movlhps xmm0,xmm3 ; xmm0=in0=(00 01 02 03) + + mulps xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FLOAT_MULT_TYPE)] + + movaps xmm1,xmm0 + movaps xmm2,xmm0 + movaps xmm3,xmm0 + + shufps xmm0,xmm0,0x00 ; xmm0=(00 00 00 00) + shufps xmm1,xmm1,0x55 ; xmm1=(01 01 01 01) + shufps xmm2,xmm2,0xAA ; xmm2=(02 02 02 02) + shufps xmm3,xmm3,0xFF ; xmm3=(03 03 03 03) + + movaps XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], xmm0 + movaps XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm0 + movaps XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], xmm1 + movaps XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm1 + movaps XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_FAST_FLOAT)], xmm2 + movaps XMMWORD [XMMBLOCK(2,1,edi,SIZEOF_FAST_FLOAT)], xmm2 + movaps XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_FAST_FLOAT)], xmm3 + movaps XMMWORD [XMMBLOCK(3,1,edi,SIZEOF_FAST_FLOAT)], xmm3 + jmp near .nextcolumn + alignx 16,7 +%endif +.columnDCT: + + ; -- Even part + + movq mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)] + movq mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)] + movq mm2, MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)] + movq mm3, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)] + + punpckhwd mm4,mm0 ; mm4=(** 02 ** 03) + punpcklwd mm0,mm0 ; mm0=(00 00 01 01) + punpckhwd mm5,mm1 ; mm5=(** 22 ** 23) + punpcklwd mm1,mm1 ; mm1=(20 20 21 21) + + psrad mm4,(DWORD_BIT-WORD_BIT) ; mm4=in0H=(02 03) + psrad mm0,(DWORD_BIT-WORD_BIT) ; mm0=in0L=(00 01) + cvtpi2ps xmm4,mm4 ; xmm4=(02 03 ** **) + cvtpi2ps xmm0,mm0 ; xmm0=(00 01 ** **) + psrad mm5,(DWORD_BIT-WORD_BIT) ; mm5=in2H=(22 23) + psrad mm1,(DWORD_BIT-WORD_BIT) ; mm1=in2L=(20 21) + cvtpi2ps xmm5,mm5 ; xmm5=(22 23 ** **) + cvtpi2ps xmm1,mm1 ; xmm1=(20 21 ** **) + + punpckhwd mm6,mm2 ; mm6=(** 42 ** 43) + punpcklwd mm2,mm2 ; mm2=(40 40 41 41) + punpckhwd mm7,mm3 ; mm7=(** 62 ** 63) + punpcklwd mm3,mm3 ; mm3=(60 60 61 61) + + psrad mm6,(DWORD_BIT-WORD_BIT) ; mm6=in4H=(42 43) + psrad mm2,(DWORD_BIT-WORD_BIT) ; mm2=in4L=(40 41) + cvtpi2ps xmm6,mm6 ; xmm6=(42 43 ** **) + cvtpi2ps xmm2,mm2 ; xmm2=(40 41 ** **) + psrad mm7,(DWORD_BIT-WORD_BIT) ; mm7=in6H=(62 63) + psrad mm3,(DWORD_BIT-WORD_BIT) ; mm3=in6L=(60 61) + cvtpi2ps xmm7,mm7 ; xmm7=(62 63 ** **) + cvtpi2ps xmm3,mm3 ; xmm3=(60 61 ** **) + + movlhps xmm0,xmm4 ; xmm0=in0=(00 01 02 03) + movlhps xmm1,xmm5 ; xmm1=in2=(20 21 22 23) + mulps xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FLOAT_MULT_TYPE)] + mulps xmm1, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_FLOAT_MULT_TYPE)] + + movlhps xmm2,xmm6 ; xmm2=in4=(40 41 42 43) + movlhps xmm3,xmm7 ; xmm3=in6=(60 61 62 63) + mulps xmm2, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_FLOAT_MULT_TYPE)] + mulps xmm3, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_FLOAT_MULT_TYPE)] + + movaps xmm4,xmm0 + movaps xmm5,xmm1 + subps xmm0,xmm2 ; xmm0=tmp11 + subps xmm1,xmm3 + addps xmm4,xmm2 ; xmm4=tmp10 + addps xmm5,xmm3 ; xmm5=tmp13 + + mulps xmm1,[GOTOFF(ebx,PD_1_414)] + subps xmm1,xmm5 ; xmm1=tmp12 + + movaps xmm6,xmm4 + movaps xmm7,xmm0 + subps xmm4,xmm5 ; xmm4=tmp3 + subps xmm0,xmm1 ; xmm0=tmp2 + addps xmm6,xmm5 ; xmm6=tmp0 + addps xmm7,xmm1 ; xmm7=tmp1 + + movaps XMMWORD [wk(1)], xmm4 ; tmp3 + movaps XMMWORD [wk(0)], xmm0 ; tmp2 + + ; -- Odd part + + movq mm4, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)] + movq mm0, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)] + movq mm5, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)] + movq mm1, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)] + + punpckhwd mm6,mm4 ; mm6=(** 12 ** 13) + punpcklwd mm4,mm4 ; mm4=(10 10 11 11) + punpckhwd mm2,mm0 ; mm2=(** 32 ** 33) + punpcklwd mm0,mm0 ; mm0=(30 30 31 31) + + psrad mm6,(DWORD_BIT-WORD_BIT) ; mm6=in1H=(12 13) + psrad mm4,(DWORD_BIT-WORD_BIT) ; mm4=in1L=(10 11) + cvtpi2ps xmm4,mm6 ; xmm4=(12 13 ** **) + cvtpi2ps xmm2,mm4 ; xmm2=(10 11 ** **) + psrad mm2,(DWORD_BIT-WORD_BIT) ; mm2=in3H=(32 33) + psrad mm0,(DWORD_BIT-WORD_BIT) ; mm0=in3L=(30 31) + cvtpi2ps xmm0,mm2 ; xmm0=(32 33 ** **) + cvtpi2ps xmm3,mm0 ; xmm3=(30 31 ** **) + + punpckhwd mm7,mm5 ; mm7=(** 52 ** 53) + punpcklwd mm5,mm5 ; mm5=(50 50 51 51) + punpckhwd mm3,mm1 ; mm3=(** 72 ** 73) + punpcklwd mm1,mm1 ; mm1=(70 70 71 71) + + movlhps xmm2,xmm4 ; xmm2=in1=(10 11 12 13) + movlhps xmm3,xmm0 ; xmm3=in3=(30 31 32 33) + + psrad mm7,(DWORD_BIT-WORD_BIT) ; mm7=in5H=(52 53) + psrad mm5,(DWORD_BIT-WORD_BIT) ; mm5=in5L=(50 51) + cvtpi2ps xmm4,mm7 ; xmm4=(52 53 ** **) + cvtpi2ps xmm5,mm5 ; xmm5=(50 51 ** **) + psrad mm3,(DWORD_BIT-WORD_BIT) ; mm3=in7H=(72 73) + psrad mm1,(DWORD_BIT-WORD_BIT) ; mm1=in7L=(70 71) + cvtpi2ps xmm0,mm3 ; xmm0=(72 73 ** **) + cvtpi2ps xmm1,mm1 ; xmm1=(70 71 ** **) + + mulps xmm2, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FLOAT_MULT_TYPE)] + mulps xmm3, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_FLOAT_MULT_TYPE)] + + movlhps xmm5,xmm4 ; xmm5=in5=(50 51 52 53) + movlhps xmm1,xmm0 ; xmm1=in7=(70 71 72 73) + mulps xmm5, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_FLOAT_MULT_TYPE)] + mulps xmm1, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_FLOAT_MULT_TYPE)] + + movaps xmm4,xmm2 + movaps xmm0,xmm5 + addps xmm2,xmm1 ; xmm2=z11 + addps xmm5,xmm3 ; xmm5=z13 + subps xmm4,xmm1 ; xmm4=z12 + subps xmm0,xmm3 ; xmm0=z10 + + movaps xmm1,xmm2 + subps xmm2,xmm5 + addps xmm1,xmm5 ; xmm1=tmp7 + + mulps xmm2,[GOTOFF(ebx,PD_1_414)] ; xmm2=tmp11 + + movaps xmm3,xmm0 + addps xmm0,xmm4 + mulps xmm0,[GOTOFF(ebx,PD_1_847)] ; xmm0=z5 + mulps xmm3,[GOTOFF(ebx,PD_M2_613)] ; xmm3=(z10 * -2.613125930) + mulps xmm4,[GOTOFF(ebx,PD_1_082)] ; xmm4=(z12 * 1.082392200) + addps xmm3,xmm0 ; xmm3=tmp12 + subps xmm4,xmm0 ; xmm4=tmp10 + + ; -- Final output stage + + subps xmm3,xmm1 ; xmm3=tmp6 + movaps xmm5,xmm6 + movaps xmm0,xmm7 + addps xmm6,xmm1 ; xmm6=data0=(00 01 02 03) + addps xmm7,xmm3 ; xmm7=data1=(10 11 12 13) + subps xmm5,xmm1 ; xmm5=data7=(70 71 72 73) + subps xmm0,xmm3 ; xmm0=data6=(60 61 62 63) + subps xmm2,xmm3 ; xmm2=tmp5 + + movaps xmm1,xmm6 ; transpose coefficients(phase 1) + unpcklps xmm6,xmm7 ; xmm6=(00 10 01 11) + unpckhps xmm1,xmm7 ; xmm1=(02 12 03 13) + movaps xmm3,xmm0 ; transpose coefficients(phase 1) + unpcklps xmm0,xmm5 ; xmm0=(60 70 61 71) + unpckhps xmm3,xmm5 ; xmm3=(62 72 63 73) + + movaps xmm7, XMMWORD [wk(0)] ; xmm7=tmp2 + movaps xmm5, XMMWORD [wk(1)] ; xmm5=tmp3 + + movaps XMMWORD [wk(0)], xmm0 ; wk(0)=(60 70 61 71) + movaps XMMWORD [wk(1)], xmm3 ; wk(1)=(62 72 63 73) + + addps xmm4,xmm2 ; xmm4=tmp4 + movaps xmm0,xmm7 + movaps xmm3,xmm5 + addps xmm7,xmm2 ; xmm7=data2=(20 21 22 23) + addps xmm5,xmm4 ; xmm5=data4=(40 41 42 43) + subps xmm0,xmm2 ; xmm0=data5=(50 51 52 53) + subps xmm3,xmm4 ; xmm3=data3=(30 31 32 33) + + movaps xmm2,xmm7 ; transpose coefficients(phase 1) + unpcklps xmm7,xmm3 ; xmm7=(20 30 21 31) + unpckhps xmm2,xmm3 ; xmm2=(22 32 23 33) + movaps xmm4,xmm5 ; transpose coefficients(phase 1) + unpcklps xmm5,xmm0 ; xmm5=(40 50 41 51) + unpckhps xmm4,xmm0 ; xmm4=(42 52 43 53) + + movaps xmm3,xmm6 ; transpose coefficients(phase 2) + unpcklps2 xmm6,xmm7 ; xmm6=(00 10 20 30) + unpckhps2 xmm3,xmm7 ; xmm3=(01 11 21 31) + movaps xmm0,xmm1 ; transpose coefficients(phase 2) + unpcklps2 xmm1,xmm2 ; xmm1=(02 12 22 32) + unpckhps2 xmm0,xmm2 ; xmm0=(03 13 23 33) + + movaps xmm7, XMMWORD [wk(0)] ; xmm7=(60 70 61 71) + movaps xmm2, XMMWORD [wk(1)] ; xmm2=(62 72 63 73) + + movaps XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], xmm6 + movaps XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], xmm3 + movaps XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_FAST_FLOAT)], xmm1 + movaps XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_FAST_FLOAT)], xmm0 + + movaps xmm6,xmm5 ; transpose coefficients(phase 2) + unpcklps2 xmm5,xmm7 ; xmm5=(40 50 60 70) + unpckhps2 xmm6,xmm7 ; xmm6=(41 51 61 71) + movaps xmm3,xmm4 ; transpose coefficients(phase 2) + unpcklps2 xmm4,xmm2 ; xmm4=(42 52 62 72) + unpckhps2 xmm3,xmm2 ; xmm3=(43 53 63 73) + + movaps XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm5 + movaps XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm6 + movaps XMMWORD [XMMBLOCK(2,1,edi,SIZEOF_FAST_FLOAT)], xmm4 + movaps XMMWORD [XMMBLOCK(3,1,edi,SIZEOF_FAST_FLOAT)], xmm3 + +.nextcolumn: + add esi, byte 4*SIZEOF_JCOEF ; coef_block + add edx, byte 4*SIZEOF_FLOAT_MULT_TYPE ; quantptr + add edi, 4*DCTSIZE*SIZEOF_FAST_FLOAT ; wsptr + dec ecx ; ctr + jnz near .columnloop + + ; -- Prefetch the next coefficient block + + prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 0*32] + prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 1*32] + prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 2*32] + prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 3*32] + + ; ---- Pass 2: process rows from work array, store into output array. + + mov eax, [original_ebp] + lea esi, [workspace] ; FAST_FLOAT *wsptr + mov edi, JSAMPARRAY [output_buf(eax)] ; (JSAMPROW *) + mov eax, JDIMENSION [output_col(eax)] + mov ecx, DCTSIZE/4 ; ctr + alignx 16,7 +.rowloop: + + ; -- Even part + + movaps xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_FAST_FLOAT)] + movaps xmm1, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_FAST_FLOAT)] + movaps xmm2, XMMWORD [XMMBLOCK(4,0,esi,SIZEOF_FAST_FLOAT)] + movaps xmm3, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_FAST_FLOAT)] + + movaps xmm4,xmm0 + movaps xmm5,xmm1 + subps xmm0,xmm2 ; xmm0=tmp11 + subps xmm1,xmm3 + addps xmm4,xmm2 ; xmm4=tmp10 + addps xmm5,xmm3 ; xmm5=tmp13 + + mulps xmm1,[GOTOFF(ebx,PD_1_414)] + subps xmm1,xmm5 ; xmm1=tmp12 + + movaps xmm6,xmm4 + movaps xmm7,xmm0 + subps xmm4,xmm5 ; xmm4=tmp3 + subps xmm0,xmm1 ; xmm0=tmp2 + addps xmm6,xmm5 ; xmm6=tmp0 + addps xmm7,xmm1 ; xmm7=tmp1 + + movaps XMMWORD [wk(1)], xmm4 ; tmp3 + movaps XMMWORD [wk(0)], xmm0 ; tmp2 + + ; -- Odd part + + movaps xmm2, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_FAST_FLOAT)] + movaps xmm3, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_FAST_FLOAT)] + movaps xmm5, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_FAST_FLOAT)] + movaps xmm1, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_FAST_FLOAT)] + + movaps xmm4,xmm2 + movaps xmm0,xmm5 + addps xmm2,xmm1 ; xmm2=z11 + addps xmm5,xmm3 ; xmm5=z13 + subps xmm4,xmm1 ; xmm4=z12 + subps xmm0,xmm3 ; xmm0=z10 + + movaps xmm1,xmm2 + subps xmm2,xmm5 + addps xmm1,xmm5 ; xmm1=tmp7 + + mulps xmm2,[GOTOFF(ebx,PD_1_414)] ; xmm2=tmp11 + + movaps xmm3,xmm0 + addps xmm0,xmm4 + mulps xmm0,[GOTOFF(ebx,PD_1_847)] ; xmm0=z5 + mulps xmm3,[GOTOFF(ebx,PD_M2_613)] ; xmm3=(z10 * -2.613125930) + mulps xmm4,[GOTOFF(ebx,PD_1_082)] ; xmm4=(z12 * 1.082392200) + addps xmm3,xmm0 ; xmm3=tmp12 + subps xmm4,xmm0 ; xmm4=tmp10 + + ; -- Final output stage + + subps xmm3,xmm1 ; xmm3=tmp6 + movaps xmm5,xmm6 + movaps xmm0,xmm7 + addps xmm6,xmm1 ; xmm6=data0=(00 10 20 30) + addps xmm7,xmm3 ; xmm7=data1=(01 11 21 31) + subps xmm5,xmm1 ; xmm5=data7=(07 17 27 37) + subps xmm0,xmm3 ; xmm0=data6=(06 16 26 36) + subps xmm2,xmm3 ; xmm2=tmp5 + + movaps xmm1,[GOTOFF(ebx,PD_0_125)] ; xmm1=[PD_0_125] + + mulps xmm6,xmm1 ; descale(1/8) + mulps xmm7,xmm1 ; descale(1/8) + mulps xmm5,xmm1 ; descale(1/8) + mulps xmm0,xmm1 ; descale(1/8) + + movhlps xmm3,xmm6 + movhlps xmm1,xmm7 + cvtps2pi mm0,xmm6 ; round to int32, mm0=data0L=(00 10) + cvtps2pi mm1,xmm7 ; round to int32, mm1=data1L=(01 11) + cvtps2pi mm2,xmm3 ; round to int32, mm2=data0H=(20 30) + cvtps2pi mm3,xmm1 ; round to int32, mm3=data1H=(21 31) + packssdw mm0,mm2 ; mm0=data0=(00 10 20 30) + packssdw mm1,mm3 ; mm1=data1=(01 11 21 31) + + movhlps xmm6,xmm5 + movhlps xmm7,xmm0 + cvtps2pi mm4,xmm5 ; round to int32, mm4=data7L=(07 17) + cvtps2pi mm5,xmm0 ; round to int32, mm5=data6L=(06 16) + cvtps2pi mm6,xmm6 ; round to int32, mm6=data7H=(27 37) + cvtps2pi mm7,xmm7 ; round to int32, mm7=data6H=(26 36) + packssdw mm4,mm6 ; mm4=data7=(07 17 27 37) + packssdw mm5,mm7 ; mm5=data6=(06 16 26 36) + + packsswb mm0,mm5 ; mm0=(00 10 20 30 06 16 26 36) + packsswb mm1,mm4 ; mm1=(01 11 21 31 07 17 27 37) + + movaps xmm3, XMMWORD [wk(0)] ; xmm3=tmp2 + movaps xmm1, XMMWORD [wk(1)] ; xmm1=tmp3 + + movaps xmm6,[GOTOFF(ebx,PD_0_125)] ; xmm6=[PD_0_125] + + addps xmm4,xmm2 ; xmm4=tmp4 + movaps xmm5,xmm3 + movaps xmm0,xmm1 + addps xmm3,xmm2 ; xmm3=data2=(02 12 22 32) + addps xmm1,xmm4 ; xmm1=data4=(04 14 24 34) + subps xmm5,xmm2 ; xmm5=data5=(05 15 25 35) + subps xmm0,xmm4 ; xmm0=data3=(03 13 23 33) + + mulps xmm3,xmm6 ; descale(1/8) + mulps xmm1,xmm6 ; descale(1/8) + mulps xmm5,xmm6 ; descale(1/8) + mulps xmm0,xmm6 ; descale(1/8) + + movhlps xmm7,xmm3 + movhlps xmm2,xmm1 + cvtps2pi mm2,xmm3 ; round to int32, mm2=data2L=(02 12) + cvtps2pi mm3,xmm1 ; round to int32, mm3=data4L=(04 14) + cvtps2pi mm6,xmm7 ; round to int32, mm6=data2H=(22 32) + cvtps2pi mm7,xmm2 ; round to int32, mm7=data4H=(24 34) + packssdw mm2,mm6 ; mm2=data2=(02 12 22 32) + packssdw mm3,mm7 ; mm3=data4=(04 14 24 34) + + movhlps xmm4,xmm5 + movhlps xmm6,xmm0 + cvtps2pi mm5,xmm5 ; round to int32, mm5=data5L=(05 15) + cvtps2pi mm4,xmm0 ; round to int32, mm4=data3L=(03 13) + cvtps2pi mm6,xmm4 ; round to int32, mm6=data5H=(25 35) + cvtps2pi mm7,xmm6 ; round to int32, mm7=data3H=(23 33) + packssdw mm5,mm6 ; mm5=data5=(05 15 25 35) + packssdw mm4,mm7 ; mm4=data3=(03 13 23 33) + + movq mm6,[GOTOFF(ebx,PB_CENTERJSAMP)] ; mm6=[PB_CENTERJSAMP] + + packsswb mm2,mm3 ; mm2=(02 12 22 32 04 14 24 34) + packsswb mm4,mm5 ; mm4=(03 13 23 33 05 15 25 35) + + paddb mm0,mm6 + paddb mm1,mm6 + paddb mm2,mm6 + paddb mm4,mm6 + + movq mm7,mm0 ; transpose coefficients(phase 1) + punpcklbw mm0,mm1 ; mm0=(00 01 10 11 20 21 30 31) + punpckhbw mm7,mm1 ; mm7=(06 07 16 17 26 27 36 37) + movq mm3,mm2 ; transpose coefficients(phase 1) + punpcklbw mm2,mm4 ; mm2=(02 03 12 13 22 23 32 33) + punpckhbw mm3,mm4 ; mm3=(04 05 14 15 24 25 34 35) + + movq mm5,mm0 ; transpose coefficients(phase 2) + punpcklwd mm0,mm2 ; mm0=(00 01 02 03 10 11 12 13) + punpckhwd mm5,mm2 ; mm5=(20 21 22 23 30 31 32 33) + movq mm6,mm3 ; transpose coefficients(phase 2) + punpcklwd mm3,mm7 ; mm3=(04 05 06 07 14 15 16 17) + punpckhwd mm6,mm7 ; mm6=(24 25 26 27 34 35 36 37) + + movq mm1,mm0 ; transpose coefficients(phase 3) + punpckldq mm0,mm3 ; mm0=(00 01 02 03 04 05 06 07) + punpckhdq mm1,mm3 ; mm1=(10 11 12 13 14 15 16 17) + movq mm4,mm5 ; transpose coefficients(phase 3) + punpckldq mm5,mm6 ; mm5=(20 21 22 23 24 25 26 27) + punpckhdq mm4,mm6 ; mm4=(30 31 32 33 34 35 36 37) + + pushpic ebx ; save GOT address + + mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW] + mov ebx, JSAMPROW [edi+1*SIZEOF_JSAMPROW] + movq MMWORD [edx+eax*SIZEOF_JSAMPLE], mm0 + movq MMWORD [ebx+eax*SIZEOF_JSAMPLE], mm1 + mov edx, JSAMPROW [edi+2*SIZEOF_JSAMPROW] + mov ebx, JSAMPROW [edi+3*SIZEOF_JSAMPROW] + movq MMWORD [edx+eax*SIZEOF_JSAMPLE], mm5 + movq MMWORD [ebx+eax*SIZEOF_JSAMPLE], mm4 + + poppic ebx ; restore GOT address + + add esi, byte 4*SIZEOF_FAST_FLOAT ; wsptr + add edi, byte 4*SIZEOF_JSAMPROW + dec ecx ; ctr + jnz near .rowloop + + emms ; empty MMX state + + pop edi + pop esi +; pop edx ; need not be preserved +; pop ecx ; need not be preserved + pop ebx + mov esp,ebp ; esp <- aligned ebp + pop esp ; esp <- original ebp + pop ebp + ret + +; For some reason, the OS X linker does not honor the request to align the +; segment unless we do this. + align 16 diff --git a/media/libjpeg/simd/jidctflt-sse2-64.asm b/media/libjpeg/simd/jidctflt-sse2-64.asm new file mode 100644 index 000000000..bdda05d97 --- /dev/null +++ b/media/libjpeg/simd/jidctflt-sse2-64.asm @@ -0,0 +1,482 @@ +; +; jidctflt.asm - floating-point IDCT (64-bit SSE & SSE2) +; +; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB +; Copyright (C) 2009, D. R. Commander. +; +; Based on the x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler). +; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 +; +; This file contains a floating-point implementation of the inverse DCT +; (Discrete Cosine Transform). The following code is based directly on +; the IJG's original jidctflt.c; see the jidctflt.c for more details. +; +; [TAB8] + +%include "jsimdext.inc" +%include "jdct.inc" + +; -------------------------------------------------------------------------- + +%macro unpcklps2 2 ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(0 1 4 5) + shufps %1,%2,0x44 +%endmacro + +%macro unpckhps2 2 ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(2 3 6 7) + shufps %1,%2,0xEE +%endmacro + +; -------------------------------------------------------------------------- + SECTION SEG_CONST + + alignz 16 + global EXTN(jconst_idct_float_sse2) + +EXTN(jconst_idct_float_sse2): + +PD_1_414 times 4 dd 1.414213562373095048801689 +PD_1_847 times 4 dd 1.847759065022573512256366 +PD_1_082 times 4 dd 1.082392200292393968799446 +PD_M2_613 times 4 dd -2.613125929752753055713286 +PD_RNDINT_MAGIC times 4 dd 100663296.0 ; (float)(0x00C00000 << 3) +PB_CENTERJSAMP times 16 db CENTERJSAMPLE + + alignz 16 + +; -------------------------------------------------------------------------- + SECTION SEG_TEXT + BITS 64 +; +; Perform dequantization and inverse DCT on one block of coefficients. +; +; GLOBAL(void) +; jsimd_idct_float_sse2 (void *dct_table, JCOEFPTR coef_block, +; JSAMPARRAY output_buf, JDIMENSION output_col) +; + +; r10 = void *dct_table +; r11 = JCOEFPTR coef_block +; r12 = JSAMPARRAY output_buf +; r13 = JDIMENSION output_col + +%define original_rbp rbp+0 +%define wk(i) rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM] +%define WK_NUM 2 +%define workspace wk(0)-DCTSIZE2*SIZEOF_FAST_FLOAT + ; FAST_FLOAT workspace[DCTSIZE2] + + align 16 + global EXTN(jsimd_idct_float_sse2) + +EXTN(jsimd_idct_float_sse2): + push rbp + mov rax,rsp ; rax = original rbp + sub rsp, byte 4 + and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits + mov [rsp],rax + mov rbp,rsp ; rbp = aligned rbp + lea rsp, [workspace] + collect_args + push rbx + + ; ---- Pass 1: process columns from input, store into work array. + + mov rdx, r10 ; quantptr + mov rsi, r11 ; inptr + lea rdi, [workspace] ; FAST_FLOAT *wsptr + mov rcx, DCTSIZE/4 ; ctr +.columnloop: +%ifndef NO_ZERO_COLUMN_TEST_FLOAT_SSE + mov eax, DWORD [DWBLOCK(1,0,rsi,SIZEOF_JCOEF)] + or eax, DWORD [DWBLOCK(2,0,rsi,SIZEOF_JCOEF)] + jnz near .columnDCT + + movq xmm1, XMM_MMWORD [MMBLOCK(1,0,rsi,SIZEOF_JCOEF)] + movq xmm2, XMM_MMWORD [MMBLOCK(2,0,rsi,SIZEOF_JCOEF)] + movq xmm3, XMM_MMWORD [MMBLOCK(3,0,rsi,SIZEOF_JCOEF)] + movq xmm4, XMM_MMWORD [MMBLOCK(4,0,rsi,SIZEOF_JCOEF)] + movq xmm5, XMM_MMWORD [MMBLOCK(5,0,rsi,SIZEOF_JCOEF)] + movq xmm6, XMM_MMWORD [MMBLOCK(6,0,rsi,SIZEOF_JCOEF)] + movq xmm7, XMM_MMWORD [MMBLOCK(7,0,rsi,SIZEOF_JCOEF)] + por xmm1,xmm2 + por xmm3,xmm4 + por xmm5,xmm6 + por xmm1,xmm3 + por xmm5,xmm7 + por xmm1,xmm5 + packsswb xmm1,xmm1 + movd eax,xmm1 + test rax,rax + jnz short .columnDCT + + ; -- AC terms all zero + + movq xmm0, XMM_MMWORD [MMBLOCK(0,0,rsi,SIZEOF_JCOEF)] + + punpcklwd xmm0,xmm0 ; xmm0=(00 00 01 01 02 02 03 03) + psrad xmm0,(DWORD_BIT-WORD_BIT) ; xmm0=in0=(00 01 02 03) + cvtdq2ps xmm0,xmm0 ; xmm0=in0=(00 01 02 03) + + mulps xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FLOAT_MULT_TYPE)] + + movaps xmm1,xmm0 + movaps xmm2,xmm0 + movaps xmm3,xmm0 + + shufps xmm0,xmm0,0x00 ; xmm0=(00 00 00 00) + shufps xmm1,xmm1,0x55 ; xmm1=(01 01 01 01) + shufps xmm2,xmm2,0xAA ; xmm2=(02 02 02 02) + shufps xmm3,xmm3,0xFF ; xmm3=(03 03 03 03) + + movaps XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_FAST_FLOAT)], xmm0 + movaps XMMWORD [XMMBLOCK(0,1,rdi,SIZEOF_FAST_FLOAT)], xmm0 + movaps XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_FAST_FLOAT)], xmm1 + movaps XMMWORD [XMMBLOCK(1,1,rdi,SIZEOF_FAST_FLOAT)], xmm1 + movaps XMMWORD [XMMBLOCK(2,0,rdi,SIZEOF_FAST_FLOAT)], xmm2 + movaps XMMWORD [XMMBLOCK(2,1,rdi,SIZEOF_FAST_FLOAT)], xmm2 + movaps XMMWORD [XMMBLOCK(3,0,rdi,SIZEOF_FAST_FLOAT)], xmm3 + movaps XMMWORD [XMMBLOCK(3,1,rdi,SIZEOF_FAST_FLOAT)], xmm3 + jmp near .nextcolumn +%endif +.columnDCT: + + ; -- Even part + + movq xmm0, XMM_MMWORD [MMBLOCK(0,0,rsi,SIZEOF_JCOEF)] + movq xmm1, XMM_MMWORD [MMBLOCK(2,0,rsi,SIZEOF_JCOEF)] + movq xmm2, XMM_MMWORD [MMBLOCK(4,0,rsi,SIZEOF_JCOEF)] + movq xmm3, XMM_MMWORD [MMBLOCK(6,0,rsi,SIZEOF_JCOEF)] + + punpcklwd xmm0,xmm0 ; xmm0=(00 00 01 01 02 02 03 03) + punpcklwd xmm1,xmm1 ; xmm1=(20 20 21 21 22 22 23 23) + psrad xmm0,(DWORD_BIT-WORD_BIT) ; xmm0=in0=(00 01 02 03) + psrad xmm1,(DWORD_BIT-WORD_BIT) ; xmm1=in2=(20 21 22 23) + cvtdq2ps xmm0,xmm0 ; xmm0=in0=(00 01 02 03) + cvtdq2ps xmm1,xmm1 ; xmm1=in2=(20 21 22 23) + + punpcklwd xmm2,xmm2 ; xmm2=(40 40 41 41 42 42 43 43) + punpcklwd xmm3,xmm3 ; xmm3=(60 60 61 61 62 62 63 63) + psrad xmm2,(DWORD_BIT-WORD_BIT) ; xmm2=in4=(40 41 42 43) + psrad xmm3,(DWORD_BIT-WORD_BIT) ; xmm3=in6=(60 61 62 63) + cvtdq2ps xmm2,xmm2 ; xmm2=in4=(40 41 42 43) + cvtdq2ps xmm3,xmm3 ; xmm3=in6=(60 61 62 63) + + mulps xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FLOAT_MULT_TYPE)] + mulps xmm1, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_FLOAT_MULT_TYPE)] + mulps xmm2, XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_FLOAT_MULT_TYPE)] + mulps xmm3, XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_FLOAT_MULT_TYPE)] + + movaps xmm4,xmm0 + movaps xmm5,xmm1 + subps xmm0,xmm2 ; xmm0=tmp11 + subps xmm1,xmm3 + addps xmm4,xmm2 ; xmm4=tmp10 + addps xmm5,xmm3 ; xmm5=tmp13 + + mulps xmm1,[rel PD_1_414] + subps xmm1,xmm5 ; xmm1=tmp12 + + movaps xmm6,xmm4 + movaps xmm7,xmm0 + subps xmm4,xmm5 ; xmm4=tmp3 + subps xmm0,xmm1 ; xmm0=tmp2 + addps xmm6,xmm5 ; xmm6=tmp0 + addps xmm7,xmm1 ; xmm7=tmp1 + + movaps XMMWORD [wk(1)], xmm4 ; tmp3 + movaps XMMWORD [wk(0)], xmm0 ; tmp2 + + ; -- Odd part + + movq xmm2, XMM_MMWORD [MMBLOCK(1,0,rsi,SIZEOF_JCOEF)] + movq xmm3, XMM_MMWORD [MMBLOCK(3,0,rsi,SIZEOF_JCOEF)] + movq xmm5, XMM_MMWORD [MMBLOCK(5,0,rsi,SIZEOF_JCOEF)] + movq xmm1, XMM_MMWORD [MMBLOCK(7,0,rsi,SIZEOF_JCOEF)] + + punpcklwd xmm2,xmm2 ; xmm2=(10 10 11 11 12 12 13 13) + punpcklwd xmm3,xmm3 ; xmm3=(30 30 31 31 32 32 33 33) + psrad xmm2,(DWORD_BIT-WORD_BIT) ; xmm2=in1=(10 11 12 13) + psrad xmm3,(DWORD_BIT-WORD_BIT) ; xmm3=in3=(30 31 32 33) + cvtdq2ps xmm2,xmm2 ; xmm2=in1=(10 11 12 13) + cvtdq2ps xmm3,xmm3 ; xmm3=in3=(30 31 32 33) + + punpcklwd xmm5,xmm5 ; xmm5=(50 50 51 51 52 52 53 53) + punpcklwd xmm1,xmm1 ; xmm1=(70 70 71 71 72 72 73 73) + psrad xmm5,(DWORD_BIT-WORD_BIT) ; xmm5=in5=(50 51 52 53) + psrad xmm1,(DWORD_BIT-WORD_BIT) ; xmm1=in7=(70 71 72 73) + cvtdq2ps xmm5,xmm5 ; xmm5=in5=(50 51 52 53) + cvtdq2ps xmm1,xmm1 ; xmm1=in7=(70 71 72 73) + + mulps xmm2, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_FLOAT_MULT_TYPE)] + mulps xmm3, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_FLOAT_MULT_TYPE)] + mulps xmm5, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_FLOAT_MULT_TYPE)] + mulps xmm1, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_FLOAT_MULT_TYPE)] + + movaps xmm4,xmm2 + movaps xmm0,xmm5 + addps xmm2,xmm1 ; xmm2=z11 + addps xmm5,xmm3 ; xmm5=z13 + subps xmm4,xmm1 ; xmm4=z12 + subps xmm0,xmm3 ; xmm0=z10 + + movaps xmm1,xmm2 + subps xmm2,xmm5 + addps xmm1,xmm5 ; xmm1=tmp7 + + mulps xmm2,[rel PD_1_414] ; xmm2=tmp11 + + movaps xmm3,xmm0 + addps xmm0,xmm4 + mulps xmm0,[rel PD_1_847] ; xmm0=z5 + mulps xmm3,[rel PD_M2_613] ; xmm3=(z10 * -2.613125930) + mulps xmm4,[rel PD_1_082] ; xmm4=(z12 * 1.082392200) + addps xmm3,xmm0 ; xmm3=tmp12 + subps xmm4,xmm0 ; xmm4=tmp10 + + ; -- Final output stage + + subps xmm3,xmm1 ; xmm3=tmp6 + movaps xmm5,xmm6 + movaps xmm0,xmm7 + addps xmm6,xmm1 ; xmm6=data0=(00 01 02 03) + addps xmm7,xmm3 ; xmm7=data1=(10 11 12 13) + subps xmm5,xmm1 ; xmm5=data7=(70 71 72 73) + subps xmm0,xmm3 ; xmm0=data6=(60 61 62 63) + subps xmm2,xmm3 ; xmm2=tmp5 + + movaps xmm1,xmm6 ; transpose coefficients(phase 1) + unpcklps xmm6,xmm7 ; xmm6=(00 10 01 11) + unpckhps xmm1,xmm7 ; xmm1=(02 12 03 13) + movaps xmm3,xmm0 ; transpose coefficients(phase 1) + unpcklps xmm0,xmm5 ; xmm0=(60 70 61 71) + unpckhps xmm3,xmm5 ; xmm3=(62 72 63 73) + + movaps xmm7, XMMWORD [wk(0)] ; xmm7=tmp2 + movaps xmm5, XMMWORD [wk(1)] ; xmm5=tmp3 + + movaps XMMWORD [wk(0)], xmm0 ; wk(0)=(60 70 61 71) + movaps XMMWORD [wk(1)], xmm3 ; wk(1)=(62 72 63 73) + + addps xmm4,xmm2 ; xmm4=tmp4 + movaps xmm0,xmm7 + movaps xmm3,xmm5 + addps xmm7,xmm2 ; xmm7=data2=(20 21 22 23) + addps xmm5,xmm4 ; xmm5=data4=(40 41 42 43) + subps xmm0,xmm2 ; xmm0=data5=(50 51 52 53) + subps xmm3,xmm4 ; xmm3=data3=(30 31 32 33) + + movaps xmm2,xmm7 ; transpose coefficients(phase 1) + unpcklps xmm7,xmm3 ; xmm7=(20 30 21 31) + unpckhps xmm2,xmm3 ; xmm2=(22 32 23 33) + movaps xmm4,xmm5 ; transpose coefficients(phase 1) + unpcklps xmm5,xmm0 ; xmm5=(40 50 41 51) + unpckhps xmm4,xmm0 ; xmm4=(42 52 43 53) + + movaps xmm3,xmm6 ; transpose coefficients(phase 2) + unpcklps2 xmm6,xmm7 ; xmm6=(00 10 20 30) + unpckhps2 xmm3,xmm7 ; xmm3=(01 11 21 31) + movaps xmm0,xmm1 ; transpose coefficients(phase 2) + unpcklps2 xmm1,xmm2 ; xmm1=(02 12 22 32) + unpckhps2 xmm0,xmm2 ; xmm0=(03 13 23 33) + + movaps xmm7, XMMWORD [wk(0)] ; xmm7=(60 70 61 71) + movaps xmm2, XMMWORD [wk(1)] ; xmm2=(62 72 63 73) + + movaps XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_FAST_FLOAT)], xmm6 + movaps XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_FAST_FLOAT)], xmm3 + movaps XMMWORD [XMMBLOCK(2,0,rdi,SIZEOF_FAST_FLOAT)], xmm1 + movaps XMMWORD [XMMBLOCK(3,0,rdi,SIZEOF_FAST_FLOAT)], xmm0 + + movaps xmm6,xmm5 ; transpose coefficients(phase 2) + unpcklps2 xmm5,xmm7 ; xmm5=(40 50 60 70) + unpckhps2 xmm6,xmm7 ; xmm6=(41 51 61 71) + movaps xmm3,xmm4 ; transpose coefficients(phase 2) + unpcklps2 xmm4,xmm2 ; xmm4=(42 52 62 72) + unpckhps2 xmm3,xmm2 ; xmm3=(43 53 63 73) + + movaps XMMWORD [XMMBLOCK(0,1,rdi,SIZEOF_FAST_FLOAT)], xmm5 + movaps XMMWORD [XMMBLOCK(1,1,rdi,SIZEOF_FAST_FLOAT)], xmm6 + movaps XMMWORD [XMMBLOCK(2,1,rdi,SIZEOF_FAST_FLOAT)], xmm4 + movaps XMMWORD [XMMBLOCK(3,1,rdi,SIZEOF_FAST_FLOAT)], xmm3 + +.nextcolumn: + add rsi, byte 4*SIZEOF_JCOEF ; coef_block + add rdx, byte 4*SIZEOF_FLOAT_MULT_TYPE ; quantptr + add rdi, 4*DCTSIZE*SIZEOF_FAST_FLOAT ; wsptr + dec rcx ; ctr + jnz near .columnloop + + ; -- Prefetch the next coefficient block + + prefetchnta [rsi + (DCTSIZE2-8)*SIZEOF_JCOEF + 0*32] + prefetchnta [rsi + (DCTSIZE2-8)*SIZEOF_JCOEF + 1*32] + prefetchnta [rsi + (DCTSIZE2-8)*SIZEOF_JCOEF + 2*32] + prefetchnta [rsi + (DCTSIZE2-8)*SIZEOF_JCOEF + 3*32] + + ; ---- Pass 2: process rows from work array, store into output array. + + mov rax, [original_rbp] + lea rsi, [workspace] ; FAST_FLOAT *wsptr + mov rdi, r12 ; (JSAMPROW *) + mov eax, r13d + mov rcx, DCTSIZE/4 ; ctr +.rowloop: + + ; -- Even part + + movaps xmm0, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_FAST_FLOAT)] + movaps xmm1, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_FAST_FLOAT)] + movaps xmm2, XMMWORD [XMMBLOCK(4,0,rsi,SIZEOF_FAST_FLOAT)] + movaps xmm3, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_FAST_FLOAT)] + + movaps xmm4,xmm0 + movaps xmm5,xmm1 + subps xmm0,xmm2 ; xmm0=tmp11 + subps xmm1,xmm3 + addps xmm4,xmm2 ; xmm4=tmp10 + addps xmm5,xmm3 ; xmm5=tmp13 + + mulps xmm1,[rel PD_1_414] + subps xmm1,xmm5 ; xmm1=tmp12 + + movaps xmm6,xmm4 + movaps xmm7,xmm0 + subps xmm4,xmm5 ; xmm4=tmp3 + subps xmm0,xmm1 ; xmm0=tmp2 + addps xmm6,xmm5 ; xmm6=tmp0 + addps xmm7,xmm1 ; xmm7=tmp1 + + movaps XMMWORD [wk(1)], xmm4 ; tmp3 + movaps XMMWORD [wk(0)], xmm0 ; tmp2 + + ; -- Odd part + + movaps xmm2, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_FAST_FLOAT)] + movaps xmm3, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_FAST_FLOAT)] + movaps xmm5, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_FAST_FLOAT)] + movaps xmm1, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_FAST_FLOAT)] + + movaps xmm4,xmm2 + movaps xmm0,xmm5 + addps xmm2,xmm1 ; xmm2=z11 + addps xmm5,xmm3 ; xmm5=z13 + subps xmm4,xmm1 ; xmm4=z12 + subps xmm0,xmm3 ; xmm0=z10 + + movaps xmm1,xmm2 + subps xmm2,xmm5 + addps xmm1,xmm5 ; xmm1=tmp7 + + mulps xmm2,[rel PD_1_414] ; xmm2=tmp11 + + movaps xmm3,xmm0 + addps xmm0,xmm4 + mulps xmm0,[rel PD_1_847] ; xmm0=z5 + mulps xmm3,[rel PD_M2_613] ; xmm3=(z10 * -2.613125930) + mulps xmm4,[rel PD_1_082] ; xmm4=(z12 * 1.082392200) + addps xmm3,xmm0 ; xmm3=tmp12 + subps xmm4,xmm0 ; xmm4=tmp10 + + ; -- Final output stage + + subps xmm3,xmm1 ; xmm3=tmp6 + movaps xmm5,xmm6 + movaps xmm0,xmm7 + addps xmm6,xmm1 ; xmm6=data0=(00 10 20 30) + addps xmm7,xmm3 ; xmm7=data1=(01 11 21 31) + subps xmm5,xmm1 ; xmm5=data7=(07 17 27 37) + subps xmm0,xmm3 ; xmm0=data6=(06 16 26 36) + subps xmm2,xmm3 ; xmm2=tmp5 + + movaps xmm1,[rel PD_RNDINT_MAGIC] ; xmm1=[rel PD_RNDINT_MAGIC] + pcmpeqd xmm3,xmm3 + psrld xmm3,WORD_BIT ; xmm3={0xFFFF 0x0000 0xFFFF 0x0000 ..} + + addps xmm6,xmm1 ; xmm6=roundint(data0/8)=(00 ** 10 ** 20 ** 30 **) + addps xmm7,xmm1 ; xmm7=roundint(data1/8)=(01 ** 11 ** 21 ** 31 **) + addps xmm0,xmm1 ; xmm0=roundint(data6/8)=(06 ** 16 ** 26 ** 36 **) + addps xmm5,xmm1 ; xmm5=roundint(data7/8)=(07 ** 17 ** 27 ** 37 **) + + pand xmm6,xmm3 ; xmm6=(00 -- 10 -- 20 -- 30 --) + pslld xmm7,WORD_BIT ; xmm7=(-- 01 -- 11 -- 21 -- 31) + pand xmm0,xmm3 ; xmm0=(06 -- 16 -- 26 -- 36 --) + pslld xmm5,WORD_BIT ; xmm5=(-- 07 -- 17 -- 27 -- 37) + por xmm6,xmm7 ; xmm6=(00 01 10 11 20 21 30 31) + por xmm0,xmm5 ; xmm0=(06 07 16 17 26 27 36 37) + + movaps xmm1, XMMWORD [wk(0)] ; xmm1=tmp2 + movaps xmm3, XMMWORD [wk(1)] ; xmm3=tmp3 + + addps xmm4,xmm2 ; xmm4=tmp4 + movaps xmm7,xmm1 + movaps xmm5,xmm3 + addps xmm1,xmm2 ; xmm1=data2=(02 12 22 32) + addps xmm3,xmm4 ; xmm3=data4=(04 14 24 34) + subps xmm7,xmm2 ; xmm7=data5=(05 15 25 35) + subps xmm5,xmm4 ; xmm5=data3=(03 13 23 33) + + movaps xmm2,[rel PD_RNDINT_MAGIC] ; xmm2=[rel PD_RNDINT_MAGIC] + pcmpeqd xmm4,xmm4 + psrld xmm4,WORD_BIT ; xmm4={0xFFFF 0x0000 0xFFFF 0x0000 ..} + + addps xmm3,xmm2 ; xmm3=roundint(data4/8)=(04 ** 14 ** 24 ** 34 **) + addps xmm7,xmm2 ; xmm7=roundint(data5/8)=(05 ** 15 ** 25 ** 35 **) + addps xmm1,xmm2 ; xmm1=roundint(data2/8)=(02 ** 12 ** 22 ** 32 **) + addps xmm5,xmm2 ; xmm5=roundint(data3/8)=(03 ** 13 ** 23 ** 33 **) + + pand xmm3,xmm4 ; xmm3=(04 -- 14 -- 24 -- 34 --) + pslld xmm7,WORD_BIT ; xmm7=(-- 05 -- 15 -- 25 -- 35) + pand xmm1,xmm4 ; xmm1=(02 -- 12 -- 22 -- 32 --) + pslld xmm5,WORD_BIT ; xmm5=(-- 03 -- 13 -- 23 -- 33) + por xmm3,xmm7 ; xmm3=(04 05 14 15 24 25 34 35) + por xmm1,xmm5 ; xmm1=(02 03 12 13 22 23 32 33) + + movdqa xmm2,[rel PB_CENTERJSAMP] ; xmm2=[rel PB_CENTERJSAMP] + + packsswb xmm6,xmm3 ; xmm6=(00 01 10 11 20 21 30 31 04 05 14 15 24 25 34 35) + packsswb xmm1,xmm0 ; xmm1=(02 03 12 13 22 23 32 33 06 07 16 17 26 27 36 37) + paddb xmm6,xmm2 + paddb xmm1,xmm2 + + movdqa xmm4,xmm6 ; transpose coefficients(phase 2) + punpcklwd xmm6,xmm1 ; xmm6=(00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33) + punpckhwd xmm4,xmm1 ; xmm4=(04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37) + + movdqa xmm7,xmm6 ; transpose coefficients(phase 3) + punpckldq xmm6,xmm4 ; xmm6=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17) + punpckhdq xmm7,xmm4 ; xmm7=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37) + + pshufd xmm5,xmm6,0x4E ; xmm5=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07) + pshufd xmm3,xmm7,0x4E ; xmm3=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27) + + mov rdx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW] + mov rbx, JSAMPROW [rdi+2*SIZEOF_JSAMPROW] + movq XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm6 + movq XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE], xmm7 + mov rdx, JSAMPROW [rdi+1*SIZEOF_JSAMPROW] + mov rbx, JSAMPROW [rdi+3*SIZEOF_JSAMPROW] + movq XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm5 + movq XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE], xmm3 + + add rsi, byte 4*SIZEOF_FAST_FLOAT ; wsptr + add rdi, byte 4*SIZEOF_JSAMPROW + dec rcx ; ctr + jnz near .rowloop + + pop rbx + uncollect_args + mov rsp,rbp ; rsp <- aligned rbp + pop rsp ; rsp <- original rbp + pop rbp + ret + +; For some reason, the OS X linker does not honor the request to align the +; segment unless we do this. + align 16 diff --git a/media/libjpeg/simd/jidctflt-sse2.asm b/media/libjpeg/simd/jidctflt-sse2.asm new file mode 100644 index 000000000..a15a9c111 --- /dev/null +++ b/media/libjpeg/simd/jidctflt-sse2.asm @@ -0,0 +1,497 @@ +; +; jidctflt.asm - floating-point IDCT (SSE & SSE2) +; +; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB +; +; Based on the x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler). +; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 +; +; This file contains a floating-point implementation of the inverse DCT +; (Discrete Cosine Transform). The following code is based directly on +; the IJG's original jidctflt.c; see the jidctflt.c for more details. +; +; [TAB8] + +%include "jsimdext.inc" +%include "jdct.inc" + +; -------------------------------------------------------------------------- + +%macro unpcklps2 2 ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(0 1 4 5) + shufps %1,%2,0x44 +%endmacro + +%macro unpckhps2 2 ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(2 3 6 7) + shufps %1,%2,0xEE +%endmacro + +; -------------------------------------------------------------------------- + SECTION SEG_CONST + + alignz 16 + global EXTN(jconst_idct_float_sse2) + +EXTN(jconst_idct_float_sse2): + +PD_1_414 times 4 dd 1.414213562373095048801689 +PD_1_847 times 4 dd 1.847759065022573512256366 +PD_1_082 times 4 dd 1.082392200292393968799446 +PD_M2_613 times 4 dd -2.613125929752753055713286 +PD_RNDINT_MAGIC times 4 dd 100663296.0 ; (float)(0x00C00000 << 3) +PB_CENTERJSAMP times 16 db CENTERJSAMPLE + + alignz 16 + +; -------------------------------------------------------------------------- + SECTION SEG_TEXT + BITS 32 +; +; Perform dequantization and inverse DCT on one block of coefficients. +; +; GLOBAL(void) +; jsimd_idct_float_sse2 (void *dct_table, JCOEFPTR coef_block, +; JSAMPARRAY output_buf, JDIMENSION output_col) +; + +%define dct_table(b) (b)+8 ; void *dct_table +%define coef_block(b) (b)+12 ; JCOEFPTR coef_block +%define output_buf(b) (b)+16 ; JSAMPARRAY output_buf +%define output_col(b) (b)+20 ; JDIMENSION output_col + +%define original_ebp ebp+0 +%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM] +%define WK_NUM 2 +%define workspace wk(0)-DCTSIZE2*SIZEOF_FAST_FLOAT + ; FAST_FLOAT workspace[DCTSIZE2] + + align 16 + global EXTN(jsimd_idct_float_sse2) + +EXTN(jsimd_idct_float_sse2): + push ebp + mov eax,esp ; eax = original ebp + sub esp, byte 4 + and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits + mov [esp],eax + mov ebp,esp ; ebp = aligned ebp + lea esp, [workspace] + push ebx +; push ecx ; need not be preserved +; push edx ; need not be preserved + push esi + push edi + + get_GOT ebx ; get GOT address + + ; ---- Pass 1: process columns from input, store into work array. + +; mov eax, [original_ebp] + mov edx, POINTER [dct_table(eax)] ; quantptr + mov esi, JCOEFPTR [coef_block(eax)] ; inptr + lea edi, [workspace] ; FAST_FLOAT *wsptr + mov ecx, DCTSIZE/4 ; ctr + alignx 16,7 +.columnloop: +%ifndef NO_ZERO_COLUMN_TEST_FLOAT_SSE + mov eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)] + or eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)] + jnz near .columnDCT + + movq xmm1, XMM_MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)] + movq xmm2, XMM_MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)] + movq xmm3, XMM_MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)] + movq xmm4, XMM_MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)] + movq xmm5, XMM_MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)] + movq xmm6, XMM_MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)] + movq xmm7, XMM_MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)] + por xmm1,xmm2 + por xmm3,xmm4 + por xmm5,xmm6 + por xmm1,xmm3 + por xmm5,xmm7 + por xmm1,xmm5 + packsswb xmm1,xmm1 + movd eax,xmm1 + test eax,eax + jnz short .columnDCT + + ; -- AC terms all zero + + movq xmm0, XMM_MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)] + + punpcklwd xmm0,xmm0 ; xmm0=(00 00 01 01 02 02 03 03) + psrad xmm0,(DWORD_BIT-WORD_BIT) ; xmm0=in0=(00 01 02 03) + cvtdq2ps xmm0,xmm0 ; xmm0=in0=(00 01 02 03) + + mulps xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FLOAT_MULT_TYPE)] + + movaps xmm1,xmm0 + movaps xmm2,xmm0 + movaps xmm3,xmm0 + + shufps xmm0,xmm0,0x00 ; xmm0=(00 00 00 00) + shufps xmm1,xmm1,0x55 ; xmm1=(01 01 01 01) + shufps xmm2,xmm2,0xAA ; xmm2=(02 02 02 02) + shufps xmm3,xmm3,0xFF ; xmm3=(03 03 03 03) + + movaps XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], xmm0 + movaps XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm0 + movaps XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], xmm1 + movaps XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm1 + movaps XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_FAST_FLOAT)], xmm2 + movaps XMMWORD [XMMBLOCK(2,1,edi,SIZEOF_FAST_FLOAT)], xmm2 + movaps XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_FAST_FLOAT)], xmm3 + movaps XMMWORD [XMMBLOCK(3,1,edi,SIZEOF_FAST_FLOAT)], xmm3 + jmp near .nextcolumn + alignx 16,7 +%endif +.columnDCT: + + ; -- Even part + + movq xmm0, XMM_MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)] + movq xmm1, XMM_MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)] + movq xmm2, XMM_MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)] + movq xmm3, XMM_MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)] + + punpcklwd xmm0,xmm0 ; xmm0=(00 00 01 01 02 02 03 03) + punpcklwd xmm1,xmm1 ; xmm1=(20 20 21 21 22 22 23 23) + psrad xmm0,(DWORD_BIT-WORD_BIT) ; xmm0=in0=(00 01 02 03) + psrad xmm1,(DWORD_BIT-WORD_BIT) ; xmm1=in2=(20 21 22 23) + cvtdq2ps xmm0,xmm0 ; xmm0=in0=(00 01 02 03) + cvtdq2ps xmm1,xmm1 ; xmm1=in2=(20 21 22 23) + + punpcklwd xmm2,xmm2 ; xmm2=(40 40 41 41 42 42 43 43) + punpcklwd xmm3,xmm3 ; xmm3=(60 60 61 61 62 62 63 63) + psrad xmm2,(DWORD_BIT-WORD_BIT) ; xmm2=in4=(40 41 42 43) + psrad xmm3,(DWORD_BIT-WORD_BIT) ; xmm3=in6=(60 61 62 63) + cvtdq2ps xmm2,xmm2 ; xmm2=in4=(40 41 42 43) + cvtdq2ps xmm3,xmm3 ; xmm3=in6=(60 61 62 63) + + mulps xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FLOAT_MULT_TYPE)] + mulps xmm1, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_FLOAT_MULT_TYPE)] + mulps xmm2, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_FLOAT_MULT_TYPE)] + mulps xmm3, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_FLOAT_MULT_TYPE)] + + movaps xmm4,xmm0 + movaps xmm5,xmm1 + subps xmm0,xmm2 ; xmm0=tmp11 + subps xmm1,xmm3 + addps xmm4,xmm2 ; xmm4=tmp10 + addps xmm5,xmm3 ; xmm5=tmp13 + + mulps xmm1,[GOTOFF(ebx,PD_1_414)] + subps xmm1,xmm5 ; xmm1=tmp12 + + movaps xmm6,xmm4 + movaps xmm7,xmm0 + subps xmm4,xmm5 ; xmm4=tmp3 + subps xmm0,xmm1 ; xmm0=tmp2 + addps xmm6,xmm5 ; xmm6=tmp0 + addps xmm7,xmm1 ; xmm7=tmp1 + + movaps XMMWORD [wk(1)], xmm4 ; tmp3 + movaps XMMWORD [wk(0)], xmm0 ; tmp2 + + ; -- Odd part + + movq xmm2, XMM_MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)] + movq xmm3, XMM_MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)] + movq xmm5, XMM_MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)] + movq xmm1, XMM_MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)] + + punpcklwd xmm2,xmm2 ; xmm2=(10 10 11 11 12 12 13 13) + punpcklwd xmm3,xmm3 ; xmm3=(30 30 31 31 32 32 33 33) + psrad xmm2,(DWORD_BIT-WORD_BIT) ; xmm2=in1=(10 11 12 13) + psrad xmm3,(DWORD_BIT-WORD_BIT) ; xmm3=in3=(30 31 32 33) + cvtdq2ps xmm2,xmm2 ; xmm2=in1=(10 11 12 13) + cvtdq2ps xmm3,xmm3 ; xmm3=in3=(30 31 32 33) + + punpcklwd xmm5,xmm5 ; xmm5=(50 50 51 51 52 52 53 53) + punpcklwd xmm1,xmm1 ; xmm1=(70 70 71 71 72 72 73 73) + psrad xmm5,(DWORD_BIT-WORD_BIT) ; xmm5=in5=(50 51 52 53) + psrad xmm1,(DWORD_BIT-WORD_BIT) ; xmm1=in7=(70 71 72 73) + cvtdq2ps xmm5,xmm5 ; xmm5=in5=(50 51 52 53) + cvtdq2ps xmm1,xmm1 ; xmm1=in7=(70 71 72 73) + + mulps xmm2, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FLOAT_MULT_TYPE)] + mulps xmm3, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_FLOAT_MULT_TYPE)] + mulps xmm5, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_FLOAT_MULT_TYPE)] + mulps xmm1, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_FLOAT_MULT_TYPE)] + + movaps xmm4,xmm2 + movaps xmm0,xmm5 + addps xmm2,xmm1 ; xmm2=z11 + addps xmm5,xmm3 ; xmm5=z13 + subps xmm4,xmm1 ; xmm4=z12 + subps xmm0,xmm3 ; xmm0=z10 + + movaps xmm1,xmm2 + subps xmm2,xmm5 + addps xmm1,xmm5 ; xmm1=tmp7 + + mulps xmm2,[GOTOFF(ebx,PD_1_414)] ; xmm2=tmp11 + + movaps xmm3,xmm0 + addps xmm0,xmm4 + mulps xmm0,[GOTOFF(ebx,PD_1_847)] ; xmm0=z5 + mulps xmm3,[GOTOFF(ebx,PD_M2_613)] ; xmm3=(z10 * -2.613125930) + mulps xmm4,[GOTOFF(ebx,PD_1_082)] ; xmm4=(z12 * 1.082392200) + addps xmm3,xmm0 ; xmm3=tmp12 + subps xmm4,xmm0 ; xmm4=tmp10 + + ; -- Final output stage + + subps xmm3,xmm1 ; xmm3=tmp6 + movaps xmm5,xmm6 + movaps xmm0,xmm7 + addps xmm6,xmm1 ; xmm6=data0=(00 01 02 03) + addps xmm7,xmm3 ; xmm7=data1=(10 11 12 13) + subps xmm5,xmm1 ; xmm5=data7=(70 71 72 73) + subps xmm0,xmm3 ; xmm0=data6=(60 61 62 63) + subps xmm2,xmm3 ; xmm2=tmp5 + + movaps xmm1,xmm6 ; transpose coefficients(phase 1) + unpcklps xmm6,xmm7 ; xmm6=(00 10 01 11) + unpckhps xmm1,xmm7 ; xmm1=(02 12 03 13) + movaps xmm3,xmm0 ; transpose coefficients(phase 1) + unpcklps xmm0,xmm5 ; xmm0=(60 70 61 71) + unpckhps xmm3,xmm5 ; xmm3=(62 72 63 73) + + movaps xmm7, XMMWORD [wk(0)] ; xmm7=tmp2 + movaps xmm5, XMMWORD [wk(1)] ; xmm5=tmp3 + + movaps XMMWORD [wk(0)], xmm0 ; wk(0)=(60 70 61 71) + movaps XMMWORD [wk(1)], xmm3 ; wk(1)=(62 72 63 73) + + addps xmm4,xmm2 ; xmm4=tmp4 + movaps xmm0,xmm7 + movaps xmm3,xmm5 + addps xmm7,xmm2 ; xmm7=data2=(20 21 22 23) + addps xmm5,xmm4 ; xmm5=data4=(40 41 42 43) + subps xmm0,xmm2 ; xmm0=data5=(50 51 52 53) + subps xmm3,xmm4 ; xmm3=data3=(30 31 32 33) + + movaps xmm2,xmm7 ; transpose coefficients(phase 1) + unpcklps xmm7,xmm3 ; xmm7=(20 30 21 31) + unpckhps xmm2,xmm3 ; xmm2=(22 32 23 33) + movaps xmm4,xmm5 ; transpose coefficients(phase 1) + unpcklps xmm5,xmm0 ; xmm5=(40 50 41 51) + unpckhps xmm4,xmm0 ; xmm4=(42 52 43 53) + + movaps xmm3,xmm6 ; transpose coefficients(phase 2) + unpcklps2 xmm6,xmm7 ; xmm6=(00 10 20 30) + unpckhps2 xmm3,xmm7 ; xmm3=(01 11 21 31) + movaps xmm0,xmm1 ; transpose coefficients(phase 2) + unpcklps2 xmm1,xmm2 ; xmm1=(02 12 22 32) + unpckhps2 xmm0,xmm2 ; xmm0=(03 13 23 33) + + movaps xmm7, XMMWORD [wk(0)] ; xmm7=(60 70 61 71) + movaps xmm2, XMMWORD [wk(1)] ; xmm2=(62 72 63 73) + + movaps XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], xmm6 + movaps XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], xmm3 + movaps XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_FAST_FLOAT)], xmm1 + movaps XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_FAST_FLOAT)], xmm0 + + movaps xmm6,xmm5 ; transpose coefficients(phase 2) + unpcklps2 xmm5,xmm7 ; xmm5=(40 50 60 70) + unpckhps2 xmm6,xmm7 ; xmm6=(41 51 61 71) + movaps xmm3,xmm4 ; transpose coefficients(phase 2) + unpcklps2 xmm4,xmm2 ; xmm4=(42 52 62 72) + unpckhps2 xmm3,xmm2 ; xmm3=(43 53 63 73) + + movaps XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm5 + movaps XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm6 + movaps XMMWORD [XMMBLOCK(2,1,edi,SIZEOF_FAST_FLOAT)], xmm4 + movaps XMMWORD [XMMBLOCK(3,1,edi,SIZEOF_FAST_FLOAT)], xmm3 + +.nextcolumn: + add esi, byte 4*SIZEOF_JCOEF ; coef_block + add edx, byte 4*SIZEOF_FLOAT_MULT_TYPE ; quantptr + add edi, 4*DCTSIZE*SIZEOF_FAST_FLOAT ; wsptr + dec ecx ; ctr + jnz near .columnloop + + ; -- Prefetch the next coefficient block + + prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 0*32] + prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 1*32] + prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 2*32] + prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 3*32] + + ; ---- Pass 2: process rows from work array, store into output array. + + mov eax, [original_ebp] + lea esi, [workspace] ; FAST_FLOAT *wsptr + mov edi, JSAMPARRAY [output_buf(eax)] ; (JSAMPROW *) + mov eax, JDIMENSION [output_col(eax)] + mov ecx, DCTSIZE/4 ; ctr + alignx 16,7 +.rowloop: + + ; -- Even part + + movaps xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_FAST_FLOAT)] + movaps xmm1, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_FAST_FLOAT)] + movaps xmm2, XMMWORD [XMMBLOCK(4,0,esi,SIZEOF_FAST_FLOAT)] + movaps xmm3, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_FAST_FLOAT)] + + movaps xmm4,xmm0 + movaps xmm5,xmm1 + subps xmm0,xmm2 ; xmm0=tmp11 + subps xmm1,xmm3 + addps xmm4,xmm2 ; xmm4=tmp10 + addps xmm5,xmm3 ; xmm5=tmp13 + + mulps xmm1,[GOTOFF(ebx,PD_1_414)] + subps xmm1,xmm5 ; xmm1=tmp12 + + movaps xmm6,xmm4 + movaps xmm7,xmm0 + subps xmm4,xmm5 ; xmm4=tmp3 + subps xmm0,xmm1 ; xmm0=tmp2 + addps xmm6,xmm5 ; xmm6=tmp0 + addps xmm7,xmm1 ; xmm7=tmp1 + + movaps XMMWORD [wk(1)], xmm4 ; tmp3 + movaps XMMWORD [wk(0)], xmm0 ; tmp2 + + ; -- Odd part + + movaps xmm2, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_FAST_FLOAT)] + movaps xmm3, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_FAST_FLOAT)] + movaps xmm5, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_FAST_FLOAT)] + movaps xmm1, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_FAST_FLOAT)] + + movaps xmm4,xmm2 + movaps xmm0,xmm5 + addps xmm2,xmm1 ; xmm2=z11 + addps xmm5,xmm3 ; xmm5=z13 + subps xmm4,xmm1 ; xmm4=z12 + subps xmm0,xmm3 ; xmm0=z10 + + movaps xmm1,xmm2 + subps xmm2,xmm5 + addps xmm1,xmm5 ; xmm1=tmp7 + + mulps xmm2,[GOTOFF(ebx,PD_1_414)] ; xmm2=tmp11 + + movaps xmm3,xmm0 + addps xmm0,xmm4 + mulps xmm0,[GOTOFF(ebx,PD_1_847)] ; xmm0=z5 + mulps xmm3,[GOTOFF(ebx,PD_M2_613)] ; xmm3=(z10 * -2.613125930) + mulps xmm4,[GOTOFF(ebx,PD_1_082)] ; xmm4=(z12 * 1.082392200) + addps xmm3,xmm0 ; xmm3=tmp12 + subps xmm4,xmm0 ; xmm4=tmp10 + + ; -- Final output stage + + subps xmm3,xmm1 ; xmm3=tmp6 + movaps xmm5,xmm6 + movaps xmm0,xmm7 + addps xmm6,xmm1 ; xmm6=data0=(00 10 20 30) + addps xmm7,xmm3 ; xmm7=data1=(01 11 21 31) + subps xmm5,xmm1 ; xmm5=data7=(07 17 27 37) + subps xmm0,xmm3 ; xmm0=data6=(06 16 26 36) + subps xmm2,xmm3 ; xmm2=tmp5 + + movaps xmm1,[GOTOFF(ebx,PD_RNDINT_MAGIC)] ; xmm1=[PD_RNDINT_MAGIC] + pcmpeqd xmm3,xmm3 + psrld xmm3,WORD_BIT ; xmm3={0xFFFF 0x0000 0xFFFF 0x0000 ..} + + addps xmm6,xmm1 ; xmm6=roundint(data0/8)=(00 ** 10 ** 20 ** 30 **) + addps xmm7,xmm1 ; xmm7=roundint(data1/8)=(01 ** 11 ** 21 ** 31 **) + addps xmm0,xmm1 ; xmm0=roundint(data6/8)=(06 ** 16 ** 26 ** 36 **) + addps xmm5,xmm1 ; xmm5=roundint(data7/8)=(07 ** 17 ** 27 ** 37 **) + + pand xmm6,xmm3 ; xmm6=(00 -- 10 -- 20 -- 30 --) + pslld xmm7,WORD_BIT ; xmm7=(-- 01 -- 11 -- 21 -- 31) + pand xmm0,xmm3 ; xmm0=(06 -- 16 -- 26 -- 36 --) + pslld xmm5,WORD_BIT ; xmm5=(-- 07 -- 17 -- 27 -- 37) + por xmm6,xmm7 ; xmm6=(00 01 10 11 20 21 30 31) + por xmm0,xmm5 ; xmm0=(06 07 16 17 26 27 36 37) + + movaps xmm1, XMMWORD [wk(0)] ; xmm1=tmp2 + movaps xmm3, XMMWORD [wk(1)] ; xmm3=tmp3 + + addps xmm4,xmm2 ; xmm4=tmp4 + movaps xmm7,xmm1 + movaps xmm5,xmm3 + addps xmm1,xmm2 ; xmm1=data2=(02 12 22 32) + addps xmm3,xmm4 ; xmm3=data4=(04 14 24 34) + subps xmm7,xmm2 ; xmm7=data5=(05 15 25 35) + subps xmm5,xmm4 ; xmm5=data3=(03 13 23 33) + + movaps xmm2,[GOTOFF(ebx,PD_RNDINT_MAGIC)] ; xmm2=[PD_RNDINT_MAGIC] + pcmpeqd xmm4,xmm4 + psrld xmm4,WORD_BIT ; xmm4={0xFFFF 0x0000 0xFFFF 0x0000 ..} + + addps xmm3,xmm2 ; xmm3=roundint(data4/8)=(04 ** 14 ** 24 ** 34 **) + addps xmm7,xmm2 ; xmm7=roundint(data5/8)=(05 ** 15 ** 25 ** 35 **) + addps xmm1,xmm2 ; xmm1=roundint(data2/8)=(02 ** 12 ** 22 ** 32 **) + addps xmm5,xmm2 ; xmm5=roundint(data3/8)=(03 ** 13 ** 23 ** 33 **) + + pand xmm3,xmm4 ; xmm3=(04 -- 14 -- 24 -- 34 --) + pslld xmm7,WORD_BIT ; xmm7=(-- 05 -- 15 -- 25 -- 35) + pand xmm1,xmm4 ; xmm1=(02 -- 12 -- 22 -- 32 --) + pslld xmm5,WORD_BIT ; xmm5=(-- 03 -- 13 -- 23 -- 33) + por xmm3,xmm7 ; xmm3=(04 05 14 15 24 25 34 35) + por xmm1,xmm5 ; xmm1=(02 03 12 13 22 23 32 33) + + movdqa xmm2,[GOTOFF(ebx,PB_CENTERJSAMP)] ; xmm2=[PB_CENTERJSAMP] + + packsswb xmm6,xmm3 ; xmm6=(00 01 10 11 20 21 30 31 04 05 14 15 24 25 34 35) + packsswb xmm1,xmm0 ; xmm1=(02 03 12 13 22 23 32 33 06 07 16 17 26 27 36 37) + paddb xmm6,xmm2 + paddb xmm1,xmm2 + + movdqa xmm4,xmm6 ; transpose coefficients(phase 2) + punpcklwd xmm6,xmm1 ; xmm6=(00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33) + punpckhwd xmm4,xmm1 ; xmm4=(04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37) + + movdqa xmm7,xmm6 ; transpose coefficients(phase 3) + punpckldq xmm6,xmm4 ; xmm6=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17) + punpckhdq xmm7,xmm4 ; xmm7=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37) + + pshufd xmm5,xmm6,0x4E ; xmm5=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07) + pshufd xmm3,xmm7,0x4E ; xmm3=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27) + + pushpic ebx ; save GOT address + + mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW] + mov ebx, JSAMPROW [edi+2*SIZEOF_JSAMPROW] + movq XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm6 + movq XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE], xmm7 + mov edx, JSAMPROW [edi+1*SIZEOF_JSAMPROW] + mov ebx, JSAMPROW [edi+3*SIZEOF_JSAMPROW] + movq XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm5 + movq XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE], xmm3 + + poppic ebx ; restore GOT address + + add esi, byte 4*SIZEOF_FAST_FLOAT ; wsptr + add edi, byte 4*SIZEOF_JSAMPROW + dec ecx ; ctr + jnz near .rowloop + + pop edi + pop esi +; pop edx ; need not be preserved +; pop ecx ; need not be preserved + pop ebx + mov esp,ebp ; esp <- aligned ebp + pop esp ; esp <- original ebp + pop ebp + ret + +; For some reason, the OS X linker does not honor the request to align the +; segment unless we do this. + align 16 diff --git a/media/libjpeg/simd/jidctfst-altivec.c b/media/libjpeg/simd/jidctfst-altivec.c new file mode 100644 index 000000000..ec30c3995 --- /dev/null +++ b/media/libjpeg/simd/jidctfst-altivec.c @@ -0,0 +1,257 @@ +/* + * AltiVec optimizations for libjpeg-turbo + * + * Copyright (C) 2014-2015, D. R. Commander. All Rights Reserved. + * + * This software is provided 'as-is', without any express or implied + * warranty. In no event will the authors be held liable for any damages + * arising from the use of this software. + * + * Permission is granted to anyone to use this software for any purpose, + * including commercial applications, and to alter it and redistribute it + * freely, subject to the following restrictions: + * + * 1. The origin of this software must not be misrepresented; you must not + * claim that you wrote the original software. If you use this software + * in a product, an acknowledgment in the product documentation would be + * appreciated but is not required. + * 2. Altered source versions must be plainly marked as such, and must not be + * misrepresented as being the original software. + * 3. This notice may not be removed or altered from any source distribution. + */ + +/* FAST INTEGER INVERSE DCT + * + * This is similar to the SSE2 implementation, except that we left-shift the + * constants by 1 less bit (the -1 in CONST_SHIFT.) This is because + * vec_madds(arg1, arg2, arg3) generates the 16-bit saturated sum of: + * the elements in arg3 + the most significant 17 bits of + * (the elements in arg1 * the elements in arg2). + */ + +#include "jsimd_altivec.h" + + +#define F_1_082 277 /* FIX(1.082392200) */ +#define F_1_414 362 /* FIX(1.414213562) */ +#define F_1_847 473 /* FIX(1.847759065) */ +#define F_2_613 669 /* FIX(2.613125930) */ +#define F_1_613 (F_2_613 - 256) /* FIX(2.613125930) - FIX(1) */ + +#define CONST_BITS 8 +#define PASS1_BITS 2 +#define PRE_MULTIPLY_SCALE_BITS 2 +#define CONST_SHIFT (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS - 1) + + +#define DO_IDCT(in) \ +{ \ + /* Even part */ \ + \ + tmp10 = vec_add(in##0, in##4); \ + tmp11 = vec_sub(in##0, in##4); \ + tmp13 = vec_add(in##2, in##6); \ + \ + tmp12 = vec_sub(in##2, in##6); \ + tmp12 = vec_sl(tmp12, pre_multiply_scale_bits); \ + tmp12 = vec_madds(tmp12, pw_F1414, pw_zero); \ + tmp12 = vec_sub(tmp12, tmp13); \ + \ + tmp0 = vec_add(tmp10, tmp13); \ + tmp3 = vec_sub(tmp10, tmp13); \ + tmp1 = vec_add(tmp11, tmp12); \ + tmp2 = vec_sub(tmp11, tmp12); \ + \ + /* Odd part */ \ + \ + z13 = vec_add(in##5, in##3); \ + z10 = vec_sub(in##5, in##3); \ + z10s = vec_sl(z10, pre_multiply_scale_bits); \ + z11 = vec_add(in##1, in##7); \ + z12s = vec_sub(in##1, in##7); \ + z12s = vec_sl(z12s, pre_multiply_scale_bits); \ + \ + tmp11 = vec_sub(z11, z13); \ + tmp11 = vec_sl(tmp11, pre_multiply_scale_bits); \ + tmp11 = vec_madds(tmp11, pw_F1414, pw_zero); \ + \ + tmp7 = vec_add(z11, z13); \ + \ + /* To avoid overflow... \ + * \ + * (Original) \ + * tmp12 = -2.613125930 * z10 + z5; \ + * \ + * (This implementation) \ + * tmp12 = (-1.613125930 - 1) * z10 + z5; \ + * = -1.613125930 * z10 - z10 + z5; \ + */ \ + \ + z5 = vec_add(z10s, z12s); \ + z5 = vec_madds(z5, pw_F1847, pw_zero); \ + \ + tmp10 = vec_madds(z12s, pw_F1082, pw_zero); \ + tmp10 = vec_sub(tmp10, z5); \ + tmp12 = vec_madds(z10s, pw_MF1613, z5); \ + tmp12 = vec_sub(tmp12, z10); \ + \ + tmp6 = vec_sub(tmp12, tmp7); \ + tmp5 = vec_sub(tmp11, tmp6); \ + tmp4 = vec_add(tmp10, tmp5); \ + \ + out0 = vec_add(tmp0, tmp7); \ + out1 = vec_add(tmp1, tmp6); \ + out2 = vec_add(tmp2, tmp5); \ + out3 = vec_sub(tmp3, tmp4); \ + out4 = vec_add(tmp3, tmp4); \ + out5 = vec_sub(tmp2, tmp5); \ + out6 = vec_sub(tmp1, tmp6); \ + out7 = vec_sub(tmp0, tmp7); \ +} + + +void +jsimd_idct_ifast_altivec (void *dct_table_, JCOEFPTR coef_block, + JSAMPARRAY output_buf, JDIMENSION output_col) +{ + short *dct_table = (short *)dct_table_; + int *outptr; + + __vector short row0, row1, row2, row3, row4, row5, row6, row7, + col0, col1, col2, col3, col4, col5, col6, col7, + quant0, quant1, quant2, quant3, quant4, quant5, quant6, quant7, + tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp10, tmp11, tmp12, tmp13, + z5, z10, z10s, z11, z12s, z13, + out0, out1, out2, out3, out4, out5, out6, out7; + __vector signed char outb; + + /* Constants */ + __vector short pw_zero = { __8X(0) }, + pw_F1414 = { __8X(F_1_414 << CONST_SHIFT) }, + pw_F1847 = { __8X(F_1_847 << CONST_SHIFT) }, + pw_MF1613 = { __8X(-F_1_613 << CONST_SHIFT) }, + pw_F1082 = { __8X(F_1_082 << CONST_SHIFT) }; + __vector unsigned short + pre_multiply_scale_bits = { __8X(PRE_MULTIPLY_SCALE_BITS) }, + pass1_bits3 = { __8X(PASS1_BITS + 3) }; + __vector signed char pb_centerjsamp = { __16X(CENTERJSAMPLE) }; + + /* Pass 1: process columns */ + + col0 = vec_ld(0, coef_block); + col1 = vec_ld(16, coef_block); + col2 = vec_ld(32, coef_block); + col3 = vec_ld(48, coef_block); + col4 = vec_ld(64, coef_block); + col5 = vec_ld(80, coef_block); + col6 = vec_ld(96, coef_block); + col7 = vec_ld(112, coef_block); + + tmp1 = vec_or(col1, col2); + tmp2 = vec_or(col3, col4); + tmp1 = vec_or(tmp1, tmp2); + tmp3 = vec_or(col5, col6); + tmp3 = vec_or(tmp3, col7); + tmp1 = vec_or(tmp1, tmp3); + + quant0 = vec_ld(0, dct_table); + col0 = vec_mladd(col0, quant0, pw_zero); + + if (vec_all_eq(tmp1, pw_zero)) { + /* AC terms all zero */ + + row0 = vec_splat(col0, 0); + row1 = vec_splat(col0, 1); + row2 = vec_splat(col0, 2); + row3 = vec_splat(col0, 3); + row4 = vec_splat(col0, 4); + row5 = vec_splat(col0, 5); + row6 = vec_splat(col0, 6); + row7 = vec_splat(col0, 7); + + } else { + + quant1 = vec_ld(16, dct_table); + quant2 = vec_ld(32, dct_table); + quant3 = vec_ld(48, dct_table); + quant4 = vec_ld(64, dct_table); + quant5 = vec_ld(80, dct_table); + quant6 = vec_ld(96, dct_table); + quant7 = vec_ld(112, dct_table); + + col1 = vec_mladd(col1, quant1, pw_zero); + col2 = vec_mladd(col2, quant2, pw_zero); + col3 = vec_mladd(col3, quant3, pw_zero); + col4 = vec_mladd(col4, quant4, pw_zero); + col5 = vec_mladd(col5, quant5, pw_zero); + col6 = vec_mladd(col6, quant6, pw_zero); + col7 = vec_mladd(col7, quant7, pw_zero); + + DO_IDCT(col); + + TRANSPOSE(out, row); + } + + /* Pass 2: process rows */ + + DO_IDCT(row); + + out0 = vec_sra(out0, pass1_bits3); + out1 = vec_sra(out1, pass1_bits3); + out2 = vec_sra(out2, pass1_bits3); + out3 = vec_sra(out3, pass1_bits3); + out4 = vec_sra(out4, pass1_bits3); + out5 = vec_sra(out5, pass1_bits3); + out6 = vec_sra(out6, pass1_bits3); + out7 = vec_sra(out7, pass1_bits3); + + TRANSPOSE(out, col); + + outb = vec_packs(col0, col0); + outb = vec_add(outb, pb_centerjsamp); + outptr = (int *)(output_buf[0] + output_col); + vec_ste((__vector int)outb, 0, outptr); + vec_ste((__vector int)outb, 4, outptr); + + outb = vec_packs(col1, col1); + outb = vec_add(outb, pb_centerjsamp); + outptr = (int *)(output_buf[1] + output_col); + vec_ste((__vector int)outb, 0, outptr); + vec_ste((__vector int)outb, 4, outptr); + + outb = vec_packs(col2, col2); + outb = vec_add(outb, pb_centerjsamp); + outptr = (int *)(output_buf[2] + output_col); + vec_ste((__vector int)outb, 0, outptr); + vec_ste((__vector int)outb, 4, outptr); + + outb = vec_packs(col3, col3); + outb = vec_add(outb, pb_centerjsamp); + outptr = (int *)(output_buf[3] + output_col); + vec_ste((__vector int)outb, 0, outptr); + vec_ste((__vector int)outb, 4, outptr); + + outb = vec_packs(col4, col4); + outb = vec_add(outb, pb_centerjsamp); + outptr = (int *)(output_buf[4] + output_col); + vec_ste((__vector int)outb, 0, outptr); + vec_ste((__vector int)outb, 4, outptr); + + outb = vec_packs(col5, col5); + outb = vec_add(outb, pb_centerjsamp); + outptr = (int *)(output_buf[5] + output_col); + vec_ste((__vector int)outb, 0, outptr); + vec_ste((__vector int)outb, 4, outptr); + + outb = vec_packs(col6, col6); + outb = vec_add(outb, pb_centerjsamp); + outptr = (int *)(output_buf[6] + output_col); + vec_ste((__vector int)outb, 0, outptr); + vec_ste((__vector int)outb, 4, outptr); + + outb = vec_packs(col7, col7); + outb = vec_add(outb, pb_centerjsamp); + outptr = (int *)(output_buf[7] + output_col); + vec_ste((__vector int)outb, 0, outptr); + vec_ste((__vector int)outb, 4, outptr); +} diff --git a/media/libjpeg/simd/jidctfst-mmx.asm b/media/libjpeg/simd/jidctfst-mmx.asm new file mode 100644 index 000000000..6e95bfbca --- /dev/null +++ b/media/libjpeg/simd/jidctfst-mmx.asm @@ -0,0 +1,499 @@ +; +; jidctfst.asm - fast integer IDCT (MMX) +; +; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB +; +; Based on the x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler). +; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 +; +; This file contains a fast, not so accurate integer implementation of +; the inverse DCT (Discrete Cosine Transform). The following code is +; based directly on the IJG's original jidctfst.c; see the jidctfst.c +; for more details. +; +; [TAB8] + +%include "jsimdext.inc" +%include "jdct.inc" + +; -------------------------------------------------------------------------- + +%define CONST_BITS 8 ; 14 is also OK. +%define PASS1_BITS 2 + +%if IFAST_SCALE_BITS != PASS1_BITS +%error "'IFAST_SCALE_BITS' must be equal to 'PASS1_BITS'." +%endif + +%if CONST_BITS == 8 +F_1_082 equ 277 ; FIX(1.082392200) +F_1_414 equ 362 ; FIX(1.414213562) +F_1_847 equ 473 ; FIX(1.847759065) +F_2_613 equ 669 ; FIX(2.613125930) +F_1_613 equ (F_2_613 - 256) ; FIX(2.613125930) - FIX(1) +%else +; NASM cannot do compile-time arithmetic on floating-point constants. +%define DESCALE(x,n) (((x)+(1<<((n)-1)))>>(n)) +F_1_082 equ DESCALE(1162209775,30-CONST_BITS) ; FIX(1.082392200) +F_1_414 equ DESCALE(1518500249,30-CONST_BITS) ; FIX(1.414213562) +F_1_847 equ DESCALE(1984016188,30-CONST_BITS) ; FIX(1.847759065) +F_2_613 equ DESCALE(2805822602,30-CONST_BITS) ; FIX(2.613125930) +F_1_613 equ (F_2_613 - (1 << CONST_BITS)) ; FIX(2.613125930) - FIX(1) +%endif + +; -------------------------------------------------------------------------- + SECTION SEG_CONST + +; PRE_MULTIPLY_SCALE_BITS <= 2 (to avoid overflow) +; CONST_BITS + CONST_SHIFT + PRE_MULTIPLY_SCALE_BITS == 16 (for pmulhw) + +%define PRE_MULTIPLY_SCALE_BITS 2 +%define CONST_SHIFT (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS) + + alignz 16 + global EXTN(jconst_idct_ifast_mmx) + +EXTN(jconst_idct_ifast_mmx): + +PW_F1414 times 4 dw F_1_414 << CONST_SHIFT +PW_F1847 times 4 dw F_1_847 << CONST_SHIFT +PW_MF1613 times 4 dw -F_1_613 << CONST_SHIFT +PW_F1082 times 4 dw F_1_082 << CONST_SHIFT +PB_CENTERJSAMP times 8 db CENTERJSAMPLE + + alignz 16 + +; -------------------------------------------------------------------------- + SECTION SEG_TEXT + BITS 32 +; +; Perform dequantization and inverse DCT on one block of coefficients. +; +; GLOBAL(void) +; jsimd_idct_ifast_mmx (void *dct_table, JCOEFPTR coef_block, +; JSAMPARRAY output_buf, JDIMENSION output_col) +; + +%define dct_table(b) (b)+8 ; jpeg_component_info *compptr +%define coef_block(b) (b)+12 ; JCOEFPTR coef_block +%define output_buf(b) (b)+16 ; JSAMPARRAY output_buf +%define output_col(b) (b)+20 ; JDIMENSION output_col + +%define original_ebp ebp+0 +%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_MMWORD ; mmword wk[WK_NUM] +%define WK_NUM 2 +%define workspace wk(0)-DCTSIZE2*SIZEOF_JCOEF + ; JCOEF workspace[DCTSIZE2] + + align 16 + global EXTN(jsimd_idct_ifast_mmx) + +EXTN(jsimd_idct_ifast_mmx): + push ebp + mov eax,esp ; eax = original ebp + sub esp, byte 4 + and esp, byte (-SIZEOF_MMWORD) ; align to 64 bits + mov [esp],eax + mov ebp,esp ; ebp = aligned ebp + lea esp, [workspace] + push ebx +; push ecx ; need not be preserved +; push edx ; need not be preserved + push esi + push edi + + get_GOT ebx ; get GOT address + + ; ---- Pass 1: process columns from input, store into work array. + +; mov eax, [original_ebp] + mov edx, POINTER [dct_table(eax)] ; quantptr + mov esi, JCOEFPTR [coef_block(eax)] ; inptr + lea edi, [workspace] ; JCOEF *wsptr + mov ecx, DCTSIZE/4 ; ctr + alignx 16,7 +.columnloop: +%ifndef NO_ZERO_COLUMN_TEST_IFAST_MMX + mov eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)] + or eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)] + jnz short .columnDCT + + movq mm0, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)] + movq mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)] + por mm0, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)] + por mm1, MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)] + por mm0, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)] + por mm1, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)] + por mm0, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)] + por mm1,mm0 + packsswb mm1,mm1 + movd eax,mm1 + test eax,eax + jnz short .columnDCT + + ; -- AC terms all zero + + movq mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)] + pmullw mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_IFAST_MULT_TYPE)] + + movq mm2,mm0 ; mm0=in0=(00 01 02 03) + punpcklwd mm0,mm0 ; mm0=(00 00 01 01) + punpckhwd mm2,mm2 ; mm2=(02 02 03 03) + + movq mm1,mm0 + punpckldq mm0,mm0 ; mm0=(00 00 00 00) + punpckhdq mm1,mm1 ; mm1=(01 01 01 01) + movq mm3,mm2 + punpckldq mm2,mm2 ; mm2=(02 02 02 02) + punpckhdq mm3,mm3 ; mm3=(03 03 03 03) + + movq MMWORD [MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm0 + movq MMWORD [MMBLOCK(0,1,edi,SIZEOF_JCOEF)], mm0 + movq MMWORD [MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm1 + movq MMWORD [MMBLOCK(1,1,edi,SIZEOF_JCOEF)], mm1 + movq MMWORD [MMBLOCK(2,0,edi,SIZEOF_JCOEF)], mm2 + movq MMWORD [MMBLOCK(2,1,edi,SIZEOF_JCOEF)], mm2 + movq MMWORD [MMBLOCK(3,0,edi,SIZEOF_JCOEF)], mm3 + movq MMWORD [MMBLOCK(3,1,edi,SIZEOF_JCOEF)], mm3 + jmp near .nextcolumn + alignx 16,7 +%endif +.columnDCT: + + ; -- Even part + + movq mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)] + movq mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)] + pmullw mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_IFAST_MULT_TYPE)] + pmullw mm1, MMWORD [MMBLOCK(2,0,edx,SIZEOF_IFAST_MULT_TYPE)] + movq mm2, MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)] + movq mm3, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)] + pmullw mm2, MMWORD [MMBLOCK(4,0,edx,SIZEOF_IFAST_MULT_TYPE)] + pmullw mm3, MMWORD [MMBLOCK(6,0,edx,SIZEOF_IFAST_MULT_TYPE)] + + movq mm4,mm0 + movq mm5,mm1 + psubw mm0,mm2 ; mm0=tmp11 + psubw mm1,mm3 + paddw mm4,mm2 ; mm4=tmp10 + paddw mm5,mm3 ; mm5=tmp13 + + psllw mm1,PRE_MULTIPLY_SCALE_BITS + pmulhw mm1,[GOTOFF(ebx,PW_F1414)] + psubw mm1,mm5 ; mm1=tmp12 + + movq mm6,mm4 + movq mm7,mm0 + psubw mm4,mm5 ; mm4=tmp3 + psubw mm0,mm1 ; mm0=tmp2 + paddw mm6,mm5 ; mm6=tmp0 + paddw mm7,mm1 ; mm7=tmp1 + + movq MMWORD [wk(1)], mm4 ; wk(1)=tmp3 + movq MMWORD [wk(0)], mm0 ; wk(0)=tmp2 + + ; -- Odd part + + movq mm2, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)] + movq mm3, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)] + pmullw mm2, MMWORD [MMBLOCK(1,0,edx,SIZEOF_IFAST_MULT_TYPE)] + pmullw mm3, MMWORD [MMBLOCK(3,0,edx,SIZEOF_IFAST_MULT_TYPE)] + movq mm5, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)] + movq mm1, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)] + pmullw mm5, MMWORD [MMBLOCK(5,0,edx,SIZEOF_IFAST_MULT_TYPE)] + pmullw mm1, MMWORD [MMBLOCK(7,0,edx,SIZEOF_IFAST_MULT_TYPE)] + + movq mm4,mm2 + movq mm0,mm5 + psubw mm2,mm1 ; mm2=z12 + psubw mm5,mm3 ; mm5=z10 + paddw mm4,mm1 ; mm4=z11 + paddw mm0,mm3 ; mm0=z13 + + movq mm1,mm5 ; mm1=z10(unscaled) + psllw mm2,PRE_MULTIPLY_SCALE_BITS + psllw mm5,PRE_MULTIPLY_SCALE_BITS + + movq mm3,mm4 + psubw mm4,mm0 + paddw mm3,mm0 ; mm3=tmp7 + + psllw mm4,PRE_MULTIPLY_SCALE_BITS + pmulhw mm4,[GOTOFF(ebx,PW_F1414)] ; mm4=tmp11 + + ; To avoid overflow... + ; + ; (Original) + ; tmp12 = -2.613125930 * z10 + z5; + ; + ; (This implementation) + ; tmp12 = (-1.613125930 - 1) * z10 + z5; + ; = -1.613125930 * z10 - z10 + z5; + + movq mm0,mm5 + paddw mm5,mm2 + pmulhw mm5,[GOTOFF(ebx,PW_F1847)] ; mm5=z5 + pmulhw mm0,[GOTOFF(ebx,PW_MF1613)] + pmulhw mm2,[GOTOFF(ebx,PW_F1082)] + psubw mm0,mm1 + psubw mm2,mm5 ; mm2=tmp10 + paddw mm0,mm5 ; mm0=tmp12 + + ; -- Final output stage + + psubw mm0,mm3 ; mm0=tmp6 + movq mm1,mm6 + movq mm5,mm7 + paddw mm6,mm3 ; mm6=data0=(00 01 02 03) + paddw mm7,mm0 ; mm7=data1=(10 11 12 13) + psubw mm1,mm3 ; mm1=data7=(70 71 72 73) + psubw mm5,mm0 ; mm5=data6=(60 61 62 63) + psubw mm4,mm0 ; mm4=tmp5 + + movq mm3,mm6 ; transpose coefficients(phase 1) + punpcklwd mm6,mm7 ; mm6=(00 10 01 11) + punpckhwd mm3,mm7 ; mm3=(02 12 03 13) + movq mm0,mm5 ; transpose coefficients(phase 1) + punpcklwd mm5,mm1 ; mm5=(60 70 61 71) + punpckhwd mm0,mm1 ; mm0=(62 72 63 73) + + movq mm7, MMWORD [wk(0)] ; mm7=tmp2 + movq mm1, MMWORD [wk(1)] ; mm1=tmp3 + + movq MMWORD [wk(0)], mm5 ; wk(0)=(60 70 61 71) + movq MMWORD [wk(1)], mm0 ; wk(1)=(62 72 63 73) + + paddw mm2,mm4 ; mm2=tmp4 + movq mm5,mm7 + movq mm0,mm1 + paddw mm7,mm4 ; mm7=data2=(20 21 22 23) + paddw mm1,mm2 ; mm1=data4=(40 41 42 43) + psubw mm5,mm4 ; mm5=data5=(50 51 52 53) + psubw mm0,mm2 ; mm0=data3=(30 31 32 33) + + movq mm4,mm7 ; transpose coefficients(phase 1) + punpcklwd mm7,mm0 ; mm7=(20 30 21 31) + punpckhwd mm4,mm0 ; mm4=(22 32 23 33) + movq mm2,mm1 ; transpose coefficients(phase 1) + punpcklwd mm1,mm5 ; mm1=(40 50 41 51) + punpckhwd mm2,mm5 ; mm2=(42 52 43 53) + + movq mm0,mm6 ; transpose coefficients(phase 2) + punpckldq mm6,mm7 ; mm6=(00 10 20 30) + punpckhdq mm0,mm7 ; mm0=(01 11 21 31) + movq mm5,mm3 ; transpose coefficients(phase 2) + punpckldq mm3,mm4 ; mm3=(02 12 22 32) + punpckhdq mm5,mm4 ; mm5=(03 13 23 33) + + movq mm7, MMWORD [wk(0)] ; mm7=(60 70 61 71) + movq mm4, MMWORD [wk(1)] ; mm4=(62 72 63 73) + + movq MMWORD [MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm6 + movq MMWORD [MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm0 + movq MMWORD [MMBLOCK(2,0,edi,SIZEOF_JCOEF)], mm3 + movq MMWORD [MMBLOCK(3,0,edi,SIZEOF_JCOEF)], mm5 + + movq mm6,mm1 ; transpose coefficients(phase 2) + punpckldq mm1,mm7 ; mm1=(40 50 60 70) + punpckhdq mm6,mm7 ; mm6=(41 51 61 71) + movq mm0,mm2 ; transpose coefficients(phase 2) + punpckldq mm2,mm4 ; mm2=(42 52 62 72) + punpckhdq mm0,mm4 ; mm0=(43 53 63 73) + + movq MMWORD [MMBLOCK(0,1,edi,SIZEOF_JCOEF)], mm1 + movq MMWORD [MMBLOCK(1,1,edi,SIZEOF_JCOEF)], mm6 + movq MMWORD [MMBLOCK(2,1,edi,SIZEOF_JCOEF)], mm2 + movq MMWORD [MMBLOCK(3,1,edi,SIZEOF_JCOEF)], mm0 + +.nextcolumn: + add esi, byte 4*SIZEOF_JCOEF ; coef_block + add edx, byte 4*SIZEOF_IFAST_MULT_TYPE ; quantptr + add edi, byte 4*DCTSIZE*SIZEOF_JCOEF ; wsptr + dec ecx ; ctr + jnz near .columnloop + + ; ---- Pass 2: process rows from work array, store into output array. + + mov eax, [original_ebp] + lea esi, [workspace] ; JCOEF *wsptr + mov edi, JSAMPARRAY [output_buf(eax)] ; (JSAMPROW *) + mov eax, JDIMENSION [output_col(eax)] + mov ecx, DCTSIZE/4 ; ctr + alignx 16,7 +.rowloop: + + ; -- Even part + + movq mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)] + movq mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)] + movq mm2, MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)] + movq mm3, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)] + + movq mm4,mm0 + movq mm5,mm1 + psubw mm0,mm2 ; mm0=tmp11 + psubw mm1,mm3 + paddw mm4,mm2 ; mm4=tmp10 + paddw mm5,mm3 ; mm5=tmp13 + + psllw mm1,PRE_MULTIPLY_SCALE_BITS + pmulhw mm1,[GOTOFF(ebx,PW_F1414)] + psubw mm1,mm5 ; mm1=tmp12 + + movq mm6,mm4 + movq mm7,mm0 + psubw mm4,mm5 ; mm4=tmp3 + psubw mm0,mm1 ; mm0=tmp2 + paddw mm6,mm5 ; mm6=tmp0 + paddw mm7,mm1 ; mm7=tmp1 + + movq MMWORD [wk(1)], mm4 ; wk(1)=tmp3 + movq MMWORD [wk(0)], mm0 ; wk(0)=tmp2 + + ; -- Odd part + + movq mm2, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)] + movq mm3, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)] + movq mm5, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)] + movq mm1, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)] + + movq mm4,mm2 + movq mm0,mm5 + psubw mm2,mm1 ; mm2=z12 + psubw mm5,mm3 ; mm5=z10 + paddw mm4,mm1 ; mm4=z11 + paddw mm0,mm3 ; mm0=z13 + + movq mm1,mm5 ; mm1=z10(unscaled) + psllw mm2,PRE_MULTIPLY_SCALE_BITS + psllw mm5,PRE_MULTIPLY_SCALE_BITS + + movq mm3,mm4 + psubw mm4,mm0 + paddw mm3,mm0 ; mm3=tmp7 + + psllw mm4,PRE_MULTIPLY_SCALE_BITS + pmulhw mm4,[GOTOFF(ebx,PW_F1414)] ; mm4=tmp11 + + ; To avoid overflow... + ; + ; (Original) + ; tmp12 = -2.613125930 * z10 + z5; + ; + ; (This implementation) + ; tmp12 = (-1.613125930 - 1) * z10 + z5; + ; = -1.613125930 * z10 - z10 + z5; + + movq mm0,mm5 + paddw mm5,mm2 + pmulhw mm5,[GOTOFF(ebx,PW_F1847)] ; mm5=z5 + pmulhw mm0,[GOTOFF(ebx,PW_MF1613)] + pmulhw mm2,[GOTOFF(ebx,PW_F1082)] + psubw mm0,mm1 + psubw mm2,mm5 ; mm2=tmp10 + paddw mm0,mm5 ; mm0=tmp12 + + ; -- Final output stage + + psubw mm0,mm3 ; mm0=tmp6 + movq mm1,mm6 + movq mm5,mm7 + paddw mm6,mm3 ; mm6=data0=(00 10 20 30) + paddw mm7,mm0 ; mm7=data1=(01 11 21 31) + psraw mm6,(PASS1_BITS+3) ; descale + psraw mm7,(PASS1_BITS+3) ; descale + psubw mm1,mm3 ; mm1=data7=(07 17 27 37) + psubw mm5,mm0 ; mm5=data6=(06 16 26 36) + psraw mm1,(PASS1_BITS+3) ; descale + psraw mm5,(PASS1_BITS+3) ; descale + psubw mm4,mm0 ; mm4=tmp5 + + packsswb mm6,mm5 ; mm6=(00 10 20 30 06 16 26 36) + packsswb mm7,mm1 ; mm7=(01 11 21 31 07 17 27 37) + + movq mm3, MMWORD [wk(0)] ; mm3=tmp2 + movq mm0, MMWORD [wk(1)] ; mm0=tmp3 + + paddw mm2,mm4 ; mm2=tmp4 + movq mm5,mm3 + movq mm1,mm0 + paddw mm3,mm4 ; mm3=data2=(02 12 22 32) + paddw mm0,mm2 ; mm0=data4=(04 14 24 34) + psraw mm3,(PASS1_BITS+3) ; descale + psraw mm0,(PASS1_BITS+3) ; descale + psubw mm5,mm4 ; mm5=data5=(05 15 25 35) + psubw mm1,mm2 ; mm1=data3=(03 13 23 33) + psraw mm5,(PASS1_BITS+3) ; descale + psraw mm1,(PASS1_BITS+3) ; descale + + movq mm4,[GOTOFF(ebx,PB_CENTERJSAMP)] ; mm4=[PB_CENTERJSAMP] + + packsswb mm3,mm0 ; mm3=(02 12 22 32 04 14 24 34) + packsswb mm1,mm5 ; mm1=(03 13 23 33 05 15 25 35) + + paddb mm6,mm4 + paddb mm7,mm4 + paddb mm3,mm4 + paddb mm1,mm4 + + movq mm2,mm6 ; transpose coefficients(phase 1) + punpcklbw mm6,mm7 ; mm6=(00 01 10 11 20 21 30 31) + punpckhbw mm2,mm7 ; mm2=(06 07 16 17 26 27 36 37) + movq mm0,mm3 ; transpose coefficients(phase 1) + punpcklbw mm3,mm1 ; mm3=(02 03 12 13 22 23 32 33) + punpckhbw mm0,mm1 ; mm0=(04 05 14 15 24 25 34 35) + + movq mm5,mm6 ; transpose coefficients(phase 2) + punpcklwd mm6,mm3 ; mm6=(00 01 02 03 10 11 12 13) + punpckhwd mm5,mm3 ; mm5=(20 21 22 23 30 31 32 33) + movq mm4,mm0 ; transpose coefficients(phase 2) + punpcklwd mm0,mm2 ; mm0=(04 05 06 07 14 15 16 17) + punpckhwd mm4,mm2 ; mm4=(24 25 26 27 34 35 36 37) + + movq mm7,mm6 ; transpose coefficients(phase 3) + punpckldq mm6,mm0 ; mm6=(00 01 02 03 04 05 06 07) + punpckhdq mm7,mm0 ; mm7=(10 11 12 13 14 15 16 17) + movq mm1,mm5 ; transpose coefficients(phase 3) + punpckldq mm5,mm4 ; mm5=(20 21 22 23 24 25 26 27) + punpckhdq mm1,mm4 ; mm1=(30 31 32 33 34 35 36 37) + + pushpic ebx ; save GOT address + + mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW] + mov ebx, JSAMPROW [edi+1*SIZEOF_JSAMPROW] + movq MMWORD [edx+eax*SIZEOF_JSAMPLE], mm6 + movq MMWORD [ebx+eax*SIZEOF_JSAMPLE], mm7 + mov edx, JSAMPROW [edi+2*SIZEOF_JSAMPROW] + mov ebx, JSAMPROW [edi+3*SIZEOF_JSAMPROW] + movq MMWORD [edx+eax*SIZEOF_JSAMPLE], mm5 + movq MMWORD [ebx+eax*SIZEOF_JSAMPLE], mm1 + + poppic ebx ; restore GOT address + + add esi, byte 4*SIZEOF_JCOEF ; wsptr + add edi, byte 4*SIZEOF_JSAMPROW + dec ecx ; ctr + jnz near .rowloop + + emms ; empty MMX state + + pop edi + pop esi +; pop edx ; need not be preserved +; pop ecx ; need not be preserved + pop ebx + mov esp,ebp ; esp <- aligned ebp + pop esp ; esp <- original ebp + pop ebp + ret + +; For some reason, the OS X linker does not honor the request to align the +; segment unless we do this. + align 16 diff --git a/media/libjpeg/simd/jidctfst-sse2-64.asm b/media/libjpeg/simd/jidctfst-sse2-64.asm new file mode 100644 index 000000000..48846426d --- /dev/null +++ b/media/libjpeg/simd/jidctfst-sse2-64.asm @@ -0,0 +1,491 @@ +; +; jidctfst.asm - fast integer IDCT (64-bit SSE2) +; +; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB +; Copyright (C) 2009, D. R. Commander. +; +; Based on the x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler). +; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 +; +; This file contains a fast, not so accurate integer implementation of +; the inverse DCT (Discrete Cosine Transform). The following code is +; based directly on the IJG's original jidctfst.c; see the jidctfst.c +; for more details. +; +; [TAB8] + +%include "jsimdext.inc" +%include "jdct.inc" + +; -------------------------------------------------------------------------- + +%define CONST_BITS 8 ; 14 is also OK. +%define PASS1_BITS 2 + +%if IFAST_SCALE_BITS != PASS1_BITS +%error "'IFAST_SCALE_BITS' must be equal to 'PASS1_BITS'." +%endif + +%if CONST_BITS == 8 +F_1_082 equ 277 ; FIX(1.082392200) +F_1_414 equ 362 ; FIX(1.414213562) +F_1_847 equ 473 ; FIX(1.847759065) +F_2_613 equ 669 ; FIX(2.613125930) +F_1_613 equ (F_2_613 - 256) ; FIX(2.613125930) - FIX(1) +%else +; NASM cannot do compile-time arithmetic on floating-point constants. +%define DESCALE(x,n) (((x)+(1<<((n)-1)))>>(n)) +F_1_082 equ DESCALE(1162209775,30-CONST_BITS) ; FIX(1.082392200) +F_1_414 equ DESCALE(1518500249,30-CONST_BITS) ; FIX(1.414213562) +F_1_847 equ DESCALE(1984016188,30-CONST_BITS) ; FIX(1.847759065) +F_2_613 equ DESCALE(2805822602,30-CONST_BITS) ; FIX(2.613125930) +F_1_613 equ (F_2_613 - (1 << CONST_BITS)) ; FIX(2.613125930) - FIX(1) +%endif + +; -------------------------------------------------------------------------- + SECTION SEG_CONST + +; PRE_MULTIPLY_SCALE_BITS <= 2 (to avoid overflow) +; CONST_BITS + CONST_SHIFT + PRE_MULTIPLY_SCALE_BITS == 16 (for pmulhw) + +%define PRE_MULTIPLY_SCALE_BITS 2 +%define CONST_SHIFT (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS) + + alignz 16 + global EXTN(jconst_idct_ifast_sse2) + +EXTN(jconst_idct_ifast_sse2): + +PW_F1414 times 8 dw F_1_414 << CONST_SHIFT +PW_F1847 times 8 dw F_1_847 << CONST_SHIFT +PW_MF1613 times 8 dw -F_1_613 << CONST_SHIFT +PW_F1082 times 8 dw F_1_082 << CONST_SHIFT +PB_CENTERJSAMP times 16 db CENTERJSAMPLE + + alignz 16 + +; -------------------------------------------------------------------------- + SECTION SEG_TEXT + BITS 64 +; +; Perform dequantization and inverse DCT on one block of coefficients. +; +; GLOBAL(void) +; jsimd_idct_ifast_sse2 (void *dct_table, JCOEFPTR coef_block, +; JSAMPARRAY output_buf, JDIMENSION output_col) +; + +; r10 = jpeg_component_info *compptr +; r11 = JCOEFPTR coef_block +; r12 = JSAMPARRAY output_buf +; r13 = JDIMENSION output_col + +%define original_rbp rbp+0 +%define wk(i) rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM] +%define WK_NUM 2 + + align 16 + global EXTN(jsimd_idct_ifast_sse2) + +EXTN(jsimd_idct_ifast_sse2): + push rbp + mov rax,rsp ; rax = original rbp + sub rsp, byte 4 + and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits + mov [rsp],rax + mov rbp,rsp ; rbp = aligned rbp + lea rsp, [wk(0)] + collect_args + + ; ---- Pass 1: process columns from input. + + mov rdx, r10 ; quantptr + mov rsi, r11 ; inptr + +%ifndef NO_ZERO_COLUMN_TEST_IFAST_SSE2 + mov eax, DWORD [DWBLOCK(1,0,rsi,SIZEOF_JCOEF)] + or eax, DWORD [DWBLOCK(2,0,rsi,SIZEOF_JCOEF)] + jnz near .columnDCT + + movdqa xmm0, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_JCOEF)] + movdqa xmm1, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_JCOEF)] + por xmm0, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_JCOEF)] + por xmm1, XMMWORD [XMMBLOCK(4,0,rsi,SIZEOF_JCOEF)] + por xmm0, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_JCOEF)] + por xmm1, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_JCOEF)] + por xmm0, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_JCOEF)] + por xmm1,xmm0 + packsswb xmm1,xmm1 + packsswb xmm1,xmm1 + movd eax,xmm1 + test rax,rax + jnz short .columnDCT + + ; -- AC terms all zero + + movdqa xmm0, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_JCOEF)] + pmullw xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_ISLOW_MULT_TYPE)] + + movdqa xmm7,xmm0 ; xmm0=in0=(00 01 02 03 04 05 06 07) + punpcklwd xmm0,xmm0 ; xmm0=(00 00 01 01 02 02 03 03) + punpckhwd xmm7,xmm7 ; xmm7=(04 04 05 05 06 06 07 07) + + pshufd xmm6,xmm0,0x00 ; xmm6=col0=(00 00 00 00 00 00 00 00) + pshufd xmm2,xmm0,0x55 ; xmm2=col1=(01 01 01 01 01 01 01 01) + pshufd xmm5,xmm0,0xAA ; xmm5=col2=(02 02 02 02 02 02 02 02) + pshufd xmm0,xmm0,0xFF ; xmm0=col3=(03 03 03 03 03 03 03 03) + pshufd xmm1,xmm7,0x00 ; xmm1=col4=(04 04 04 04 04 04 04 04) + pshufd xmm4,xmm7,0x55 ; xmm4=col5=(05 05 05 05 05 05 05 05) + pshufd xmm3,xmm7,0xAA ; xmm3=col6=(06 06 06 06 06 06 06 06) + pshufd xmm7,xmm7,0xFF ; xmm7=col7=(07 07 07 07 07 07 07 07) + + movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=col1 + movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=col3 + jmp near .column_end +%endif +.columnDCT: + + ; -- Even part + + movdqa xmm0, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_JCOEF)] + movdqa xmm1, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_JCOEF)] + pmullw xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_IFAST_MULT_TYPE)] + pmullw xmm1, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_IFAST_MULT_TYPE)] + movdqa xmm2, XMMWORD [XMMBLOCK(4,0,rsi,SIZEOF_JCOEF)] + movdqa xmm3, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_JCOEF)] + pmullw xmm2, XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_IFAST_MULT_TYPE)] + pmullw xmm3, XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_IFAST_MULT_TYPE)] + + movdqa xmm4,xmm0 + movdqa xmm5,xmm1 + psubw xmm0,xmm2 ; xmm0=tmp11 + psubw xmm1,xmm3 + paddw xmm4,xmm2 ; xmm4=tmp10 + paddw xmm5,xmm3 ; xmm5=tmp13 + + psllw xmm1,PRE_MULTIPLY_SCALE_BITS + pmulhw xmm1,[rel PW_F1414] + psubw xmm1,xmm5 ; xmm1=tmp12 + + movdqa xmm6,xmm4 + movdqa xmm7,xmm0 + psubw xmm4,xmm5 ; xmm4=tmp3 + psubw xmm0,xmm1 ; xmm0=tmp2 + paddw xmm6,xmm5 ; xmm6=tmp0 + paddw xmm7,xmm1 ; xmm7=tmp1 + + movdqa XMMWORD [wk(1)], xmm4 ; wk(1)=tmp3 + movdqa XMMWORD [wk(0)], xmm0 ; wk(0)=tmp2 + + ; -- Odd part + + movdqa xmm2, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_JCOEF)] + movdqa xmm3, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_JCOEF)] + pmullw xmm2, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_IFAST_MULT_TYPE)] + pmullw xmm3, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_IFAST_MULT_TYPE)] + movdqa xmm5, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_JCOEF)] + movdqa xmm1, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_JCOEF)] + pmullw xmm5, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_IFAST_MULT_TYPE)] + pmullw xmm1, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_IFAST_MULT_TYPE)] + + movdqa xmm4,xmm2 + movdqa xmm0,xmm5 + psubw xmm2,xmm1 ; xmm2=z12 + psubw xmm5,xmm3 ; xmm5=z10 + paddw xmm4,xmm1 ; xmm4=z11 + paddw xmm0,xmm3 ; xmm0=z13 + + movdqa xmm1,xmm5 ; xmm1=z10(unscaled) + psllw xmm2,PRE_MULTIPLY_SCALE_BITS + psllw xmm5,PRE_MULTIPLY_SCALE_BITS + + movdqa xmm3,xmm4 + psubw xmm4,xmm0 + paddw xmm3,xmm0 ; xmm3=tmp7 + + psllw xmm4,PRE_MULTIPLY_SCALE_BITS + pmulhw xmm4,[rel PW_F1414] ; xmm4=tmp11 + + ; To avoid overflow... + ; + ; (Original) + ; tmp12 = -2.613125930 * z10 + z5; + ; + ; (This implementation) + ; tmp12 = (-1.613125930 - 1) * z10 + z5; + ; = -1.613125930 * z10 - z10 + z5; + + movdqa xmm0,xmm5 + paddw xmm5,xmm2 + pmulhw xmm5,[rel PW_F1847] ; xmm5=z5 + pmulhw xmm0,[rel PW_MF1613] + pmulhw xmm2,[rel PW_F1082] + psubw xmm0,xmm1 + psubw xmm2,xmm5 ; xmm2=tmp10 + paddw xmm0,xmm5 ; xmm0=tmp12 + + ; -- Final output stage + + psubw xmm0,xmm3 ; xmm0=tmp6 + movdqa xmm1,xmm6 + movdqa xmm5,xmm7 + paddw xmm6,xmm3 ; xmm6=data0=(00 01 02 03 04 05 06 07) + paddw xmm7,xmm0 ; xmm7=data1=(10 11 12 13 14 15 16 17) + psubw xmm1,xmm3 ; xmm1=data7=(70 71 72 73 74 75 76 77) + psubw xmm5,xmm0 ; xmm5=data6=(60 61 62 63 64 65 66 67) + psubw xmm4,xmm0 ; xmm4=tmp5 + + movdqa xmm3,xmm6 ; transpose coefficients(phase 1) + punpcklwd xmm6,xmm7 ; xmm6=(00 10 01 11 02 12 03 13) + punpckhwd xmm3,xmm7 ; xmm3=(04 14 05 15 06 16 07 17) + movdqa xmm0,xmm5 ; transpose coefficients(phase 1) + punpcklwd xmm5,xmm1 ; xmm5=(60 70 61 71 62 72 63 73) + punpckhwd xmm0,xmm1 ; xmm0=(64 74 65 75 66 76 67 77) + + movdqa xmm7, XMMWORD [wk(0)] ; xmm7=tmp2 + movdqa xmm1, XMMWORD [wk(1)] ; xmm1=tmp3 + + movdqa XMMWORD [wk(0)], xmm5 ; wk(0)=(60 70 61 71 62 72 63 73) + movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=(64 74 65 75 66 76 67 77) + + paddw xmm2,xmm4 ; xmm2=tmp4 + movdqa xmm5,xmm7 + movdqa xmm0,xmm1 + paddw xmm7,xmm4 ; xmm7=data2=(20 21 22 23 24 25 26 27) + paddw xmm1,xmm2 ; xmm1=data4=(40 41 42 43 44 45 46 47) + psubw xmm5,xmm4 ; xmm5=data5=(50 51 52 53 54 55 56 57) + psubw xmm0,xmm2 ; xmm0=data3=(30 31 32 33 34 35 36 37) + + movdqa xmm4,xmm7 ; transpose coefficients(phase 1) + punpcklwd xmm7,xmm0 ; xmm7=(20 30 21 31 22 32 23 33) + punpckhwd xmm4,xmm0 ; xmm4=(24 34 25 35 26 36 27 37) + movdqa xmm2,xmm1 ; transpose coefficients(phase 1) + punpcklwd xmm1,xmm5 ; xmm1=(40 50 41 51 42 52 43 53) + punpckhwd xmm2,xmm5 ; xmm2=(44 54 45 55 46 56 47 57) + + movdqa xmm0,xmm3 ; transpose coefficients(phase 2) + punpckldq xmm3,xmm4 ; xmm3=(04 14 24 34 05 15 25 35) + punpckhdq xmm0,xmm4 ; xmm0=(06 16 26 36 07 17 27 37) + movdqa xmm5,xmm6 ; transpose coefficients(phase 2) + punpckldq xmm6,xmm7 ; xmm6=(00 10 20 30 01 11 21 31) + punpckhdq xmm5,xmm7 ; xmm5=(02 12 22 32 03 13 23 33) + + movdqa xmm4, XMMWORD [wk(0)] ; xmm4=(60 70 61 71 62 72 63 73) + movdqa xmm7, XMMWORD [wk(1)] ; xmm7=(64 74 65 75 66 76 67 77) + + movdqa XMMWORD [wk(0)], xmm3 ; wk(0)=(04 14 24 34 05 15 25 35) + movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=(06 16 26 36 07 17 27 37) + + movdqa xmm3,xmm1 ; transpose coefficients(phase 2) + punpckldq xmm1,xmm4 ; xmm1=(40 50 60 70 41 51 61 71) + punpckhdq xmm3,xmm4 ; xmm3=(42 52 62 72 43 53 63 73) + movdqa xmm0,xmm2 ; transpose coefficients(phase 2) + punpckldq xmm2,xmm7 ; xmm2=(44 54 64 74 45 55 65 75) + punpckhdq xmm0,xmm7 ; xmm0=(46 56 66 76 47 57 67 77) + + movdqa xmm4,xmm6 ; transpose coefficients(phase 3) + punpcklqdq xmm6,xmm1 ; xmm6=col0=(00 10 20 30 40 50 60 70) + punpckhqdq xmm4,xmm1 ; xmm4=col1=(01 11 21 31 41 51 61 71) + movdqa xmm7,xmm5 ; transpose coefficients(phase 3) + punpcklqdq xmm5,xmm3 ; xmm5=col2=(02 12 22 32 42 52 62 72) + punpckhqdq xmm7,xmm3 ; xmm7=col3=(03 13 23 33 43 53 63 73) + + movdqa xmm1, XMMWORD [wk(0)] ; xmm1=(04 14 24 34 05 15 25 35) + movdqa xmm3, XMMWORD [wk(1)] ; xmm3=(06 16 26 36 07 17 27 37) + + movdqa XMMWORD [wk(0)], xmm4 ; wk(0)=col1 + movdqa XMMWORD [wk(1)], xmm7 ; wk(1)=col3 + + movdqa xmm4,xmm1 ; transpose coefficients(phase 3) + punpcklqdq xmm1,xmm2 ; xmm1=col4=(04 14 24 34 44 54 64 74) + punpckhqdq xmm4,xmm2 ; xmm4=col5=(05 15 25 35 45 55 65 75) + movdqa xmm7,xmm3 ; transpose coefficients(phase 3) + punpcklqdq xmm3,xmm0 ; xmm3=col6=(06 16 26 36 46 56 66 76) + punpckhqdq xmm7,xmm0 ; xmm7=col7=(07 17 27 37 47 57 67 77) +.column_end: + + ; -- Prefetch the next coefficient block + + prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 0*32] + prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 1*32] + prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 2*32] + prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 3*32] + + ; ---- Pass 2: process rows from work array, store into output array. + + mov rax, [original_rbp] + mov rdi, r12 ; (JSAMPROW *) + mov eax, r13d + + ; -- Even part + + ; xmm6=col0, xmm5=col2, xmm1=col4, xmm3=col6 + + movdqa xmm2,xmm6 + movdqa xmm0,xmm5 + psubw xmm6,xmm1 ; xmm6=tmp11 + psubw xmm5,xmm3 + paddw xmm2,xmm1 ; xmm2=tmp10 + paddw xmm0,xmm3 ; xmm0=tmp13 + + psllw xmm5,PRE_MULTIPLY_SCALE_BITS + pmulhw xmm5,[rel PW_F1414] + psubw xmm5,xmm0 ; xmm5=tmp12 + + movdqa xmm1,xmm2 + movdqa xmm3,xmm6 + psubw xmm2,xmm0 ; xmm2=tmp3 + psubw xmm6,xmm5 ; xmm6=tmp2 + paddw xmm1,xmm0 ; xmm1=tmp0 + paddw xmm3,xmm5 ; xmm3=tmp1 + + movdqa xmm0, XMMWORD [wk(0)] ; xmm0=col1 + movdqa xmm5, XMMWORD [wk(1)] ; xmm5=col3 + + movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=tmp3 + movdqa XMMWORD [wk(1)], xmm6 ; wk(1)=tmp2 + + ; -- Odd part + + ; xmm0=col1, xmm5=col3, xmm4=col5, xmm7=col7 + + movdqa xmm2,xmm0 + movdqa xmm6,xmm4 + psubw xmm0,xmm7 ; xmm0=z12 + psubw xmm4,xmm5 ; xmm4=z10 + paddw xmm2,xmm7 ; xmm2=z11 + paddw xmm6,xmm5 ; xmm6=z13 + + movdqa xmm7,xmm4 ; xmm7=z10(unscaled) + psllw xmm0,PRE_MULTIPLY_SCALE_BITS + psllw xmm4,PRE_MULTIPLY_SCALE_BITS + + movdqa xmm5,xmm2 + psubw xmm2,xmm6 + paddw xmm5,xmm6 ; xmm5=tmp7 + + psllw xmm2,PRE_MULTIPLY_SCALE_BITS + pmulhw xmm2,[rel PW_F1414] ; xmm2=tmp11 + + ; To avoid overflow... + ; + ; (Original) + ; tmp12 = -2.613125930 * z10 + z5; + ; + ; (This implementation) + ; tmp12 = (-1.613125930 - 1) * z10 + z5; + ; = -1.613125930 * z10 - z10 + z5; + + movdqa xmm6,xmm4 + paddw xmm4,xmm0 + pmulhw xmm4,[rel PW_F1847] ; xmm4=z5 + pmulhw xmm6,[rel PW_MF1613] + pmulhw xmm0,[rel PW_F1082] + psubw xmm6,xmm7 + psubw xmm0,xmm4 ; xmm0=tmp10 + paddw xmm6,xmm4 ; xmm6=tmp12 + + ; -- Final output stage + + psubw xmm6,xmm5 ; xmm6=tmp6 + movdqa xmm7,xmm1 + movdqa xmm4,xmm3 + paddw xmm1,xmm5 ; xmm1=data0=(00 10 20 30 40 50 60 70) + paddw xmm3,xmm6 ; xmm3=data1=(01 11 21 31 41 51 61 71) + psraw xmm1,(PASS1_BITS+3) ; descale + psraw xmm3,(PASS1_BITS+3) ; descale + psubw xmm7,xmm5 ; xmm7=data7=(07 17 27 37 47 57 67 77) + psubw xmm4,xmm6 ; xmm4=data6=(06 16 26 36 46 56 66 76) + psraw xmm7,(PASS1_BITS+3) ; descale + psraw xmm4,(PASS1_BITS+3) ; descale + psubw xmm2,xmm6 ; xmm2=tmp5 + + packsswb xmm1,xmm4 ; xmm1=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76) + packsswb xmm3,xmm7 ; xmm3=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77) + + movdqa xmm5, XMMWORD [wk(1)] ; xmm5=tmp2 + movdqa xmm6, XMMWORD [wk(0)] ; xmm6=tmp3 + + paddw xmm0,xmm2 ; xmm0=tmp4 + movdqa xmm4,xmm5 + movdqa xmm7,xmm6 + paddw xmm5,xmm2 ; xmm5=data2=(02 12 22 32 42 52 62 72) + paddw xmm6,xmm0 ; xmm6=data4=(04 14 24 34 44 54 64 74) + psraw xmm5,(PASS1_BITS+3) ; descale + psraw xmm6,(PASS1_BITS+3) ; descale + psubw xmm4,xmm2 ; xmm4=data5=(05 15 25 35 45 55 65 75) + psubw xmm7,xmm0 ; xmm7=data3=(03 13 23 33 43 53 63 73) + psraw xmm4,(PASS1_BITS+3) ; descale + psraw xmm7,(PASS1_BITS+3) ; descale + + movdqa xmm2,[rel PB_CENTERJSAMP] ; xmm2=[rel PB_CENTERJSAMP] + + packsswb xmm5,xmm6 ; xmm5=(02 12 22 32 42 52 62 72 04 14 24 34 44 54 64 74) + packsswb xmm7,xmm4 ; xmm7=(03 13 23 33 43 53 63 73 05 15 25 35 45 55 65 75) + + paddb xmm1,xmm2 + paddb xmm3,xmm2 + paddb xmm5,xmm2 + paddb xmm7,xmm2 + + movdqa xmm0,xmm1 ; transpose coefficients(phase 1) + punpcklbw xmm1,xmm3 ; xmm1=(00 01 10 11 20 21 30 31 40 41 50 51 60 61 70 71) + punpckhbw xmm0,xmm3 ; xmm0=(06 07 16 17 26 27 36 37 46 47 56 57 66 67 76 77) + movdqa xmm6,xmm5 ; transpose coefficients(phase 1) + punpcklbw xmm5,xmm7 ; xmm5=(02 03 12 13 22 23 32 33 42 43 52 53 62 63 72 73) + punpckhbw xmm6,xmm7 ; xmm6=(04 05 14 15 24 25 34 35 44 45 54 55 64 65 74 75) + + movdqa xmm4,xmm1 ; transpose coefficients(phase 2) + punpcklwd xmm1,xmm5 ; xmm1=(00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33) + punpckhwd xmm4,xmm5 ; xmm4=(40 41 42 43 50 51 52 53 60 61 62 63 70 71 72 73) + movdqa xmm2,xmm6 ; transpose coefficients(phase 2) + punpcklwd xmm6,xmm0 ; xmm6=(04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37) + punpckhwd xmm2,xmm0 ; xmm2=(44 45 46 47 54 55 56 57 64 65 66 67 74 75 76 77) + + movdqa xmm3,xmm1 ; transpose coefficients(phase 3) + punpckldq xmm1,xmm6 ; xmm1=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17) + punpckhdq xmm3,xmm6 ; xmm3=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37) + movdqa xmm7,xmm4 ; transpose coefficients(phase 3) + punpckldq xmm4,xmm2 ; xmm4=(40 41 42 43 44 45 46 47 50 51 52 53 54 55 56 57) + punpckhdq xmm7,xmm2 ; xmm7=(60 61 62 63 64 65 66 67 70 71 72 73 74 75 76 77) + + pshufd xmm5,xmm1,0x4E ; xmm5=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07) + pshufd xmm0,xmm3,0x4E ; xmm0=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27) + pshufd xmm6,xmm4,0x4E ; xmm6=(50 51 52 53 54 55 56 57 40 41 42 43 44 45 46 47) + pshufd xmm2,xmm7,0x4E ; xmm2=(70 71 72 73 74 75 76 77 60 61 62 63 64 65 66 67) + + mov rdx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW] + mov rsi, JSAMPROW [rdi+2*SIZEOF_JSAMPROW] + movq XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm1 + movq XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm3 + mov rdx, JSAMPROW [rdi+4*SIZEOF_JSAMPROW] + mov rsi, JSAMPROW [rdi+6*SIZEOF_JSAMPROW] + movq XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm4 + movq XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm7 + + mov rdx, JSAMPROW [rdi+1*SIZEOF_JSAMPROW] + mov rsi, JSAMPROW [rdi+3*SIZEOF_JSAMPROW] + movq XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm5 + movq XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm0 + mov rdx, JSAMPROW [rdi+5*SIZEOF_JSAMPROW] + mov rsi, JSAMPROW [rdi+7*SIZEOF_JSAMPROW] + movq XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm6 + movq XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm2 + + uncollect_args + mov rsp,rbp ; rsp <- aligned rbp + pop rsp ; rsp <- original rbp + pop rbp + ret + ret + +; For some reason, the OS X linker does not honor the request to align the +; segment unless we do this. + align 16 diff --git a/media/libjpeg/simd/jidctfst-sse2.asm b/media/libjpeg/simd/jidctfst-sse2.asm new file mode 100644 index 000000000..f591e55f0 --- /dev/null +++ b/media/libjpeg/simd/jidctfst-sse2.asm @@ -0,0 +1,501 @@ +; +; jidctfst.asm - fast integer IDCT (SSE2) +; +; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB +; +; Based on the x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler). +; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 +; +; This file contains a fast, not so accurate integer implementation of +; the inverse DCT (Discrete Cosine Transform). The following code is +; based directly on the IJG's original jidctfst.c; see the jidctfst.c +; for more details. +; +; [TAB8] + +%include "jsimdext.inc" +%include "jdct.inc" + +; -------------------------------------------------------------------------- + +%define CONST_BITS 8 ; 14 is also OK. +%define PASS1_BITS 2 + +%if IFAST_SCALE_BITS != PASS1_BITS +%error "'IFAST_SCALE_BITS' must be equal to 'PASS1_BITS'." +%endif + +%if CONST_BITS == 8 +F_1_082 equ 277 ; FIX(1.082392200) +F_1_414 equ 362 ; FIX(1.414213562) +F_1_847 equ 473 ; FIX(1.847759065) +F_2_613 equ 669 ; FIX(2.613125930) +F_1_613 equ (F_2_613 - 256) ; FIX(2.613125930) - FIX(1) +%else +; NASM cannot do compile-time arithmetic on floating-point constants. +%define DESCALE(x,n) (((x)+(1<<((n)-1)))>>(n)) +F_1_082 equ DESCALE(1162209775,30-CONST_BITS) ; FIX(1.082392200) +F_1_414 equ DESCALE(1518500249,30-CONST_BITS) ; FIX(1.414213562) +F_1_847 equ DESCALE(1984016188,30-CONST_BITS) ; FIX(1.847759065) +F_2_613 equ DESCALE(2805822602,30-CONST_BITS) ; FIX(2.613125930) +F_1_613 equ (F_2_613 - (1 << CONST_BITS)) ; FIX(2.613125930) - FIX(1) +%endif + +; -------------------------------------------------------------------------- + SECTION SEG_CONST + +; PRE_MULTIPLY_SCALE_BITS <= 2 (to avoid overflow) +; CONST_BITS + CONST_SHIFT + PRE_MULTIPLY_SCALE_BITS == 16 (for pmulhw) + +%define PRE_MULTIPLY_SCALE_BITS 2 +%define CONST_SHIFT (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS) + + alignz 16 + global EXTN(jconst_idct_ifast_sse2) + +EXTN(jconst_idct_ifast_sse2): + +PW_F1414 times 8 dw F_1_414 << CONST_SHIFT +PW_F1847 times 8 dw F_1_847 << CONST_SHIFT +PW_MF1613 times 8 dw -F_1_613 << CONST_SHIFT +PW_F1082 times 8 dw F_1_082 << CONST_SHIFT +PB_CENTERJSAMP times 16 db CENTERJSAMPLE + + alignz 16 + +; -------------------------------------------------------------------------- + SECTION SEG_TEXT + BITS 32 +; +; Perform dequantization and inverse DCT on one block of coefficients. +; +; GLOBAL(void) +; jsimd_idct_ifast_sse2 (void *dct_table, JCOEFPTR coef_block, +; JSAMPARRAY output_buf, JDIMENSION output_col) +; + +%define dct_table(b) (b)+8 ; jpeg_component_info *compptr +%define coef_block(b) (b)+12 ; JCOEFPTR coef_block +%define output_buf(b) (b)+16 ; JSAMPARRAY output_buf +%define output_col(b) (b)+20 ; JDIMENSION output_col + +%define original_ebp ebp+0 +%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM] +%define WK_NUM 2 + + align 16 + global EXTN(jsimd_idct_ifast_sse2) + +EXTN(jsimd_idct_ifast_sse2): + push ebp + mov eax,esp ; eax = original ebp + sub esp, byte 4 + and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits + mov [esp],eax + mov ebp,esp ; ebp = aligned ebp + lea esp, [wk(0)] + pushpic ebx +; push ecx ; unused +; push edx ; need not be preserved + push esi + push edi + + get_GOT ebx ; get GOT address + + ; ---- Pass 1: process columns from input. + +; mov eax, [original_ebp] + mov edx, POINTER [dct_table(eax)] ; quantptr + mov esi, JCOEFPTR [coef_block(eax)] ; inptr + +%ifndef NO_ZERO_COLUMN_TEST_IFAST_SSE2 + mov eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)] + or eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)] + jnz near .columnDCT + + movdqa xmm0, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)] + movdqa xmm1, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_JCOEF)] + por xmm0, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_JCOEF)] + por xmm1, XMMWORD [XMMBLOCK(4,0,esi,SIZEOF_JCOEF)] + por xmm0, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_JCOEF)] + por xmm1, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_JCOEF)] + por xmm0, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_JCOEF)] + por xmm1,xmm0 + packsswb xmm1,xmm1 + packsswb xmm1,xmm1 + movd eax,xmm1 + test eax,eax + jnz short .columnDCT + + ; -- AC terms all zero + + movdqa xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_JCOEF)] + pmullw xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)] + + movdqa xmm7,xmm0 ; xmm0=in0=(00 01 02 03 04 05 06 07) + punpcklwd xmm0,xmm0 ; xmm0=(00 00 01 01 02 02 03 03) + punpckhwd xmm7,xmm7 ; xmm7=(04 04 05 05 06 06 07 07) + + pshufd xmm6,xmm0,0x00 ; xmm6=col0=(00 00 00 00 00 00 00 00) + pshufd xmm2,xmm0,0x55 ; xmm2=col1=(01 01 01 01 01 01 01 01) + pshufd xmm5,xmm0,0xAA ; xmm5=col2=(02 02 02 02 02 02 02 02) + pshufd xmm0,xmm0,0xFF ; xmm0=col3=(03 03 03 03 03 03 03 03) + pshufd xmm1,xmm7,0x00 ; xmm1=col4=(04 04 04 04 04 04 04 04) + pshufd xmm4,xmm7,0x55 ; xmm4=col5=(05 05 05 05 05 05 05 05) + pshufd xmm3,xmm7,0xAA ; xmm3=col6=(06 06 06 06 06 06 06 06) + pshufd xmm7,xmm7,0xFF ; xmm7=col7=(07 07 07 07 07 07 07 07) + + movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=col1 + movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=col3 + jmp near .column_end + alignx 16,7 +%endif +.columnDCT: + + ; -- Even part + + movdqa xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_JCOEF)] + movdqa xmm1, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_JCOEF)] + pmullw xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_IFAST_MULT_TYPE)] + pmullw xmm1, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_IFAST_MULT_TYPE)] + movdqa xmm2, XMMWORD [XMMBLOCK(4,0,esi,SIZEOF_JCOEF)] + movdqa xmm3, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_JCOEF)] + pmullw xmm2, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_IFAST_MULT_TYPE)] + pmullw xmm3, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_IFAST_MULT_TYPE)] + + movdqa xmm4,xmm0 + movdqa xmm5,xmm1 + psubw xmm0,xmm2 ; xmm0=tmp11 + psubw xmm1,xmm3 + paddw xmm4,xmm2 ; xmm4=tmp10 + paddw xmm5,xmm3 ; xmm5=tmp13 + + psllw xmm1,PRE_MULTIPLY_SCALE_BITS + pmulhw xmm1,[GOTOFF(ebx,PW_F1414)] + psubw xmm1,xmm5 ; xmm1=tmp12 + + movdqa xmm6,xmm4 + movdqa xmm7,xmm0 + psubw xmm4,xmm5 ; xmm4=tmp3 + psubw xmm0,xmm1 ; xmm0=tmp2 + paddw xmm6,xmm5 ; xmm6=tmp0 + paddw xmm7,xmm1 ; xmm7=tmp1 + + movdqa XMMWORD [wk(1)], xmm4 ; wk(1)=tmp3 + movdqa XMMWORD [wk(0)], xmm0 ; wk(0)=tmp2 + + ; -- Odd part + + movdqa xmm2, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)] + movdqa xmm3, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_JCOEF)] + pmullw xmm2, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_IFAST_MULT_TYPE)] + pmullw xmm3, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_IFAST_MULT_TYPE)] + movdqa xmm5, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_JCOEF)] + movdqa xmm1, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_JCOEF)] + pmullw xmm5, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_IFAST_MULT_TYPE)] + pmullw xmm1, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_IFAST_MULT_TYPE)] + + movdqa xmm4,xmm2 + movdqa xmm0,xmm5 + psubw xmm2,xmm1 ; xmm2=z12 + psubw xmm5,xmm3 ; xmm5=z10 + paddw xmm4,xmm1 ; xmm4=z11 + paddw xmm0,xmm3 ; xmm0=z13 + + movdqa xmm1,xmm5 ; xmm1=z10(unscaled) + psllw xmm2,PRE_MULTIPLY_SCALE_BITS + psllw xmm5,PRE_MULTIPLY_SCALE_BITS + + movdqa xmm3,xmm4 + psubw xmm4,xmm0 + paddw xmm3,xmm0 ; xmm3=tmp7 + + psllw xmm4,PRE_MULTIPLY_SCALE_BITS + pmulhw xmm4,[GOTOFF(ebx,PW_F1414)] ; xmm4=tmp11 + + ; To avoid overflow... + ; + ; (Original) + ; tmp12 = -2.613125930 * z10 + z5; + ; + ; (This implementation) + ; tmp12 = (-1.613125930 - 1) * z10 + z5; + ; = -1.613125930 * z10 - z10 + z5; + + movdqa xmm0,xmm5 + paddw xmm5,xmm2 + pmulhw xmm5,[GOTOFF(ebx,PW_F1847)] ; xmm5=z5 + pmulhw xmm0,[GOTOFF(ebx,PW_MF1613)] + pmulhw xmm2,[GOTOFF(ebx,PW_F1082)] + psubw xmm0,xmm1 + psubw xmm2,xmm5 ; xmm2=tmp10 + paddw xmm0,xmm5 ; xmm0=tmp12 + + ; -- Final output stage + + psubw xmm0,xmm3 ; xmm0=tmp6 + movdqa xmm1,xmm6 + movdqa xmm5,xmm7 + paddw xmm6,xmm3 ; xmm6=data0=(00 01 02 03 04 05 06 07) + paddw xmm7,xmm0 ; xmm7=data1=(10 11 12 13 14 15 16 17) + psubw xmm1,xmm3 ; xmm1=data7=(70 71 72 73 74 75 76 77) + psubw xmm5,xmm0 ; xmm5=data6=(60 61 62 63 64 65 66 67) + psubw xmm4,xmm0 ; xmm4=tmp5 + + movdqa xmm3,xmm6 ; transpose coefficients(phase 1) + punpcklwd xmm6,xmm7 ; xmm6=(00 10 01 11 02 12 03 13) + punpckhwd xmm3,xmm7 ; xmm3=(04 14 05 15 06 16 07 17) + movdqa xmm0,xmm5 ; transpose coefficients(phase 1) + punpcklwd xmm5,xmm1 ; xmm5=(60 70 61 71 62 72 63 73) + punpckhwd xmm0,xmm1 ; xmm0=(64 74 65 75 66 76 67 77) + + movdqa xmm7, XMMWORD [wk(0)] ; xmm7=tmp2 + movdqa xmm1, XMMWORD [wk(1)] ; xmm1=tmp3 + + movdqa XMMWORD [wk(0)], xmm5 ; wk(0)=(60 70 61 71 62 72 63 73) + movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=(64 74 65 75 66 76 67 77) + + paddw xmm2,xmm4 ; xmm2=tmp4 + movdqa xmm5,xmm7 + movdqa xmm0,xmm1 + paddw xmm7,xmm4 ; xmm7=data2=(20 21 22 23 24 25 26 27) + paddw xmm1,xmm2 ; xmm1=data4=(40 41 42 43 44 45 46 47) + psubw xmm5,xmm4 ; xmm5=data5=(50 51 52 53 54 55 56 57) + psubw xmm0,xmm2 ; xmm0=data3=(30 31 32 33 34 35 36 37) + + movdqa xmm4,xmm7 ; transpose coefficients(phase 1) + punpcklwd xmm7,xmm0 ; xmm7=(20 30 21 31 22 32 23 33) + punpckhwd xmm4,xmm0 ; xmm4=(24 34 25 35 26 36 27 37) + movdqa xmm2,xmm1 ; transpose coefficients(phase 1) + punpcklwd xmm1,xmm5 ; xmm1=(40 50 41 51 42 52 43 53) + punpckhwd xmm2,xmm5 ; xmm2=(44 54 45 55 46 56 47 57) + + movdqa xmm0,xmm3 ; transpose coefficients(phase 2) + punpckldq xmm3,xmm4 ; xmm3=(04 14 24 34 05 15 25 35) + punpckhdq xmm0,xmm4 ; xmm0=(06 16 26 36 07 17 27 37) + movdqa xmm5,xmm6 ; transpose coefficients(phase 2) + punpckldq xmm6,xmm7 ; xmm6=(00 10 20 30 01 11 21 31) + punpckhdq xmm5,xmm7 ; xmm5=(02 12 22 32 03 13 23 33) + + movdqa xmm4, XMMWORD [wk(0)] ; xmm4=(60 70 61 71 62 72 63 73) + movdqa xmm7, XMMWORD [wk(1)] ; xmm7=(64 74 65 75 66 76 67 77) + + movdqa XMMWORD [wk(0)], xmm3 ; wk(0)=(04 14 24 34 05 15 25 35) + movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=(06 16 26 36 07 17 27 37) + + movdqa xmm3,xmm1 ; transpose coefficients(phase 2) + punpckldq xmm1,xmm4 ; xmm1=(40 50 60 70 41 51 61 71) + punpckhdq xmm3,xmm4 ; xmm3=(42 52 62 72 43 53 63 73) + movdqa xmm0,xmm2 ; transpose coefficients(phase 2) + punpckldq xmm2,xmm7 ; xmm2=(44 54 64 74 45 55 65 75) + punpckhdq xmm0,xmm7 ; xmm0=(46 56 66 76 47 57 67 77) + + movdqa xmm4,xmm6 ; transpose coefficients(phase 3) + punpcklqdq xmm6,xmm1 ; xmm6=col0=(00 10 20 30 40 50 60 70) + punpckhqdq xmm4,xmm1 ; xmm4=col1=(01 11 21 31 41 51 61 71) + movdqa xmm7,xmm5 ; transpose coefficients(phase 3) + punpcklqdq xmm5,xmm3 ; xmm5=col2=(02 12 22 32 42 52 62 72) + punpckhqdq xmm7,xmm3 ; xmm7=col3=(03 13 23 33 43 53 63 73) + + movdqa xmm1, XMMWORD [wk(0)] ; xmm1=(04 14 24 34 05 15 25 35) + movdqa xmm3, XMMWORD [wk(1)] ; xmm3=(06 16 26 36 07 17 27 37) + + movdqa XMMWORD [wk(0)], xmm4 ; wk(0)=col1 + movdqa XMMWORD [wk(1)], xmm7 ; wk(1)=col3 + + movdqa xmm4,xmm1 ; transpose coefficients(phase 3) + punpcklqdq xmm1,xmm2 ; xmm1=col4=(04 14 24 34 44 54 64 74) + punpckhqdq xmm4,xmm2 ; xmm4=col5=(05 15 25 35 45 55 65 75) + movdqa xmm7,xmm3 ; transpose coefficients(phase 3) + punpcklqdq xmm3,xmm0 ; xmm3=col6=(06 16 26 36 46 56 66 76) + punpckhqdq xmm7,xmm0 ; xmm7=col7=(07 17 27 37 47 57 67 77) +.column_end: + + ; -- Prefetch the next coefficient block + + prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 0*32] + prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 1*32] + prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 2*32] + prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 3*32] + + ; ---- Pass 2: process rows from work array, store into output array. + + mov eax, [original_ebp] + mov edi, JSAMPARRAY [output_buf(eax)] ; (JSAMPROW *) + mov eax, JDIMENSION [output_col(eax)] + + ; -- Even part + + ; xmm6=col0, xmm5=col2, xmm1=col4, xmm3=col6 + + movdqa xmm2,xmm6 + movdqa xmm0,xmm5 + psubw xmm6,xmm1 ; xmm6=tmp11 + psubw xmm5,xmm3 + paddw xmm2,xmm1 ; xmm2=tmp10 + paddw xmm0,xmm3 ; xmm0=tmp13 + + psllw xmm5,PRE_MULTIPLY_SCALE_BITS + pmulhw xmm5,[GOTOFF(ebx,PW_F1414)] + psubw xmm5,xmm0 ; xmm5=tmp12 + + movdqa xmm1,xmm2 + movdqa xmm3,xmm6 + psubw xmm2,xmm0 ; xmm2=tmp3 + psubw xmm6,xmm5 ; xmm6=tmp2 + paddw xmm1,xmm0 ; xmm1=tmp0 + paddw xmm3,xmm5 ; xmm3=tmp1 + + movdqa xmm0, XMMWORD [wk(0)] ; xmm0=col1 + movdqa xmm5, XMMWORD [wk(1)] ; xmm5=col3 + + movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=tmp3 + movdqa XMMWORD [wk(1)], xmm6 ; wk(1)=tmp2 + + ; -- Odd part + + ; xmm0=col1, xmm5=col3, xmm4=col5, xmm7=col7 + + movdqa xmm2,xmm0 + movdqa xmm6,xmm4 + psubw xmm0,xmm7 ; xmm0=z12 + psubw xmm4,xmm5 ; xmm4=z10 + paddw xmm2,xmm7 ; xmm2=z11 + paddw xmm6,xmm5 ; xmm6=z13 + + movdqa xmm7,xmm4 ; xmm7=z10(unscaled) + psllw xmm0,PRE_MULTIPLY_SCALE_BITS + psllw xmm4,PRE_MULTIPLY_SCALE_BITS + + movdqa xmm5,xmm2 + psubw xmm2,xmm6 + paddw xmm5,xmm6 ; xmm5=tmp7 + + psllw xmm2,PRE_MULTIPLY_SCALE_BITS + pmulhw xmm2,[GOTOFF(ebx,PW_F1414)] ; xmm2=tmp11 + + ; To avoid overflow... + ; + ; (Original) + ; tmp12 = -2.613125930 * z10 + z5; + ; + ; (This implementation) + ; tmp12 = (-1.613125930 - 1) * z10 + z5; + ; = -1.613125930 * z10 - z10 + z5; + + movdqa xmm6,xmm4 + paddw xmm4,xmm0 + pmulhw xmm4,[GOTOFF(ebx,PW_F1847)] ; xmm4=z5 + pmulhw xmm6,[GOTOFF(ebx,PW_MF1613)] + pmulhw xmm0,[GOTOFF(ebx,PW_F1082)] + psubw xmm6,xmm7 + psubw xmm0,xmm4 ; xmm0=tmp10 + paddw xmm6,xmm4 ; xmm6=tmp12 + + ; -- Final output stage + + psubw xmm6,xmm5 ; xmm6=tmp6 + movdqa xmm7,xmm1 + movdqa xmm4,xmm3 + paddw xmm1,xmm5 ; xmm1=data0=(00 10 20 30 40 50 60 70) + paddw xmm3,xmm6 ; xmm3=data1=(01 11 21 31 41 51 61 71) + psraw xmm1,(PASS1_BITS+3) ; descale + psraw xmm3,(PASS1_BITS+3) ; descale + psubw xmm7,xmm5 ; xmm7=data7=(07 17 27 37 47 57 67 77) + psubw xmm4,xmm6 ; xmm4=data6=(06 16 26 36 46 56 66 76) + psraw xmm7,(PASS1_BITS+3) ; descale + psraw xmm4,(PASS1_BITS+3) ; descale + psubw xmm2,xmm6 ; xmm2=tmp5 + + packsswb xmm1,xmm4 ; xmm1=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76) + packsswb xmm3,xmm7 ; xmm3=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77) + + movdqa xmm5, XMMWORD [wk(1)] ; xmm5=tmp2 + movdqa xmm6, XMMWORD [wk(0)] ; xmm6=tmp3 + + paddw xmm0,xmm2 ; xmm0=tmp4 + movdqa xmm4,xmm5 + movdqa xmm7,xmm6 + paddw xmm5,xmm2 ; xmm5=data2=(02 12 22 32 42 52 62 72) + paddw xmm6,xmm0 ; xmm6=data4=(04 14 24 34 44 54 64 74) + psraw xmm5,(PASS1_BITS+3) ; descale + psraw xmm6,(PASS1_BITS+3) ; descale + psubw xmm4,xmm2 ; xmm4=data5=(05 15 25 35 45 55 65 75) + psubw xmm7,xmm0 ; xmm7=data3=(03 13 23 33 43 53 63 73) + psraw xmm4,(PASS1_BITS+3) ; descale + psraw xmm7,(PASS1_BITS+3) ; descale + + movdqa xmm2,[GOTOFF(ebx,PB_CENTERJSAMP)] ; xmm2=[PB_CENTERJSAMP] + + packsswb xmm5,xmm6 ; xmm5=(02 12 22 32 42 52 62 72 04 14 24 34 44 54 64 74) + packsswb xmm7,xmm4 ; xmm7=(03 13 23 33 43 53 63 73 05 15 25 35 45 55 65 75) + + paddb xmm1,xmm2 + paddb xmm3,xmm2 + paddb xmm5,xmm2 + paddb xmm7,xmm2 + + movdqa xmm0,xmm1 ; transpose coefficients(phase 1) + punpcklbw xmm1,xmm3 ; xmm1=(00 01 10 11 20 21 30 31 40 41 50 51 60 61 70 71) + punpckhbw xmm0,xmm3 ; xmm0=(06 07 16 17 26 27 36 37 46 47 56 57 66 67 76 77) + movdqa xmm6,xmm5 ; transpose coefficients(phase 1) + punpcklbw xmm5,xmm7 ; xmm5=(02 03 12 13 22 23 32 33 42 43 52 53 62 63 72 73) + punpckhbw xmm6,xmm7 ; xmm6=(04 05 14 15 24 25 34 35 44 45 54 55 64 65 74 75) + + movdqa xmm4,xmm1 ; transpose coefficients(phase 2) + punpcklwd xmm1,xmm5 ; xmm1=(00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33) + punpckhwd xmm4,xmm5 ; xmm4=(40 41 42 43 50 51 52 53 60 61 62 63 70 71 72 73) + movdqa xmm2,xmm6 ; transpose coefficients(phase 2) + punpcklwd xmm6,xmm0 ; xmm6=(04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37) + punpckhwd xmm2,xmm0 ; xmm2=(44 45 46 47 54 55 56 57 64 65 66 67 74 75 76 77) + + movdqa xmm3,xmm1 ; transpose coefficients(phase 3) + punpckldq xmm1,xmm6 ; xmm1=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17) + punpckhdq xmm3,xmm6 ; xmm3=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37) + movdqa xmm7,xmm4 ; transpose coefficients(phase 3) + punpckldq xmm4,xmm2 ; xmm4=(40 41 42 43 44 45 46 47 50 51 52 53 54 55 56 57) + punpckhdq xmm7,xmm2 ; xmm7=(60 61 62 63 64 65 66 67 70 71 72 73 74 75 76 77) + + pshufd xmm5,xmm1,0x4E ; xmm5=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07) + pshufd xmm0,xmm3,0x4E ; xmm0=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27) + pshufd xmm6,xmm4,0x4E ; xmm6=(50 51 52 53 54 55 56 57 40 41 42 43 44 45 46 47) + pshufd xmm2,xmm7,0x4E ; xmm2=(70 71 72 73 74 75 76 77 60 61 62 63 64 65 66 67) + + mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW] + mov esi, JSAMPROW [edi+2*SIZEOF_JSAMPROW] + movq XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm1 + movq XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm3 + mov edx, JSAMPROW [edi+4*SIZEOF_JSAMPROW] + mov esi, JSAMPROW [edi+6*SIZEOF_JSAMPROW] + movq XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm4 + movq XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm7 + + mov edx, JSAMPROW [edi+1*SIZEOF_JSAMPROW] + mov esi, JSAMPROW [edi+3*SIZEOF_JSAMPROW] + movq XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm5 + movq XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm0 + mov edx, JSAMPROW [edi+5*SIZEOF_JSAMPROW] + mov esi, JSAMPROW [edi+7*SIZEOF_JSAMPROW] + movq XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm6 + movq XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm2 + + pop edi + pop esi +; pop edx ; need not be preserved +; pop ecx ; unused + poppic ebx + mov esp,ebp ; esp <- aligned ebp + pop esp ; esp <- original ebp + pop ebp + ret + +; For some reason, the OS X linker does not honor the request to align the +; segment unless we do this. + align 16 diff --git a/media/libjpeg/simd/jidctint-altivec.c b/media/libjpeg/simd/jidctint-altivec.c new file mode 100644 index 000000000..935f35d1e --- /dev/null +++ b/media/libjpeg/simd/jidctint-altivec.c @@ -0,0 +1,359 @@ +/* + * AltiVec optimizations for libjpeg-turbo + * + * Copyright (C) 2014-2015, D. R. Commander. All Rights Reserved. + * + * This software is provided 'as-is', without any express or implied + * warranty. In no event will the authors be held liable for any damages + * arising from the use of this software. + * + * Permission is granted to anyone to use this software for any purpose, + * including commercial applications, and to alter it and redistribute it + * freely, subject to the following restrictions: + * + * 1. The origin of this software must not be misrepresented; you must not + * claim that you wrote the original software. If you use this software + * in a product, an acknowledgment in the product documentation would be + * appreciated but is not required. + * 2. Altered source versions must be plainly marked as such, and must not be + * misrepresented as being the original software. + * 3. This notice may not be removed or altered from any source distribution. + */ + +/* SLOW INTEGER INVERSE DCT */ + +#include "jsimd_altivec.h" + + +#define F_0_298 2446 /* FIX(0.298631336) */ +#define F_0_390 3196 /* FIX(0.390180644) */ +#define F_0_541 4433 /* FIX(0.541196100) */ +#define F_0_765 6270 /* FIX(0.765366865) */ +#define F_0_899 7373 /* FIX(0.899976223) */ +#define F_1_175 9633 /* FIX(1.175875602) */ +#define F_1_501 12299 /* FIX(1.501321110) */ +#define F_1_847 15137 /* FIX(1.847759065) */ +#define F_1_961 16069 /* FIX(1.961570560) */ +#define F_2_053 16819 /* FIX(2.053119869) */ +#define F_2_562 20995 /* FIX(2.562915447) */ +#define F_3_072 25172 /* FIX(3.072711026) */ + +#define CONST_BITS 13 +#define PASS1_BITS 2 +#define DESCALE_P1 (CONST_BITS - PASS1_BITS) +#define DESCALE_P2 (CONST_BITS + PASS1_BITS + 3) + + +#define DO_IDCT(in, PASS) \ +{ \ + /* Even part \ + * \ + * (Original) \ + * z1 = (z2 + z3) * 0.541196100; \ + * tmp2 = z1 + z3 * -1.847759065; \ + * tmp3 = z1 + z2 * 0.765366865; \ + * \ + * (This implementation) \ + * tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065); \ + * tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100; \ + */ \ + \ + in##26l = vec_mergeh(in##2, in##6); \ + in##26h = vec_mergel(in##2, in##6); \ + \ + tmp3l = vec_msums(in##26l, pw_f130_f054, pd_zero); \ + tmp3h = vec_msums(in##26h, pw_f130_f054, pd_zero); \ + tmp2l = vec_msums(in##26l, pw_f054_mf130, pd_zero); \ + tmp2h = vec_msums(in##26h, pw_f054_mf130, pd_zero); \ + \ + tmp0 = vec_add(in##0, in##4); \ + tmp1 = vec_sub(in##0, in##4); \ + \ + tmp0l = vec_unpackh(tmp0); \ + tmp0h = vec_unpackl(tmp0); \ + tmp0l = vec_sl(tmp0l, const_bits); \ + tmp0h = vec_sl(tmp0h, const_bits); \ + tmp0l = vec_add(tmp0l, pd_descale_p##PASS); \ + tmp0h = vec_add(tmp0h, pd_descale_p##PASS); \ + \ + tmp10l = vec_add(tmp0l, tmp3l); \ + tmp10h = vec_add(tmp0h, tmp3h); \ + tmp13l = vec_sub(tmp0l, tmp3l); \ + tmp13h = vec_sub(tmp0h, tmp3h); \ + \ + tmp1l = vec_unpackh(tmp1); \ + tmp1h = vec_unpackl(tmp1); \ + tmp1l = vec_sl(tmp1l, const_bits); \ + tmp1h = vec_sl(tmp1h, const_bits); \ + tmp1l = vec_add(tmp1l, pd_descale_p##PASS); \ + tmp1h = vec_add(tmp1h, pd_descale_p##PASS); \ + \ + tmp11l = vec_add(tmp1l, tmp2l); \ + tmp11h = vec_add(tmp1h, tmp2h); \ + tmp12l = vec_sub(tmp1l, tmp2l); \ + tmp12h = vec_sub(tmp1h, tmp2h); \ + \ + /* Odd part */ \ + \ + z3 = vec_add(in##3, in##7); \ + z4 = vec_add(in##1, in##5); \ + \ + /* (Original) \ + * z5 = (z3 + z4) * 1.175875602; \ + * z3 = z3 * -1.961570560; z4 = z4 * -0.390180644; \ + * z3 += z5; z4 += z5; \ + * \ + * (This implementation) \ + * z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602; \ + * z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644); \ + */ \ + \ + z34l = vec_mergeh(z3, z4); \ + z34h = vec_mergel(z3, z4); \ + \ + z3l = vec_msums(z34l, pw_mf078_f117, pd_zero); \ + z3h = vec_msums(z34h, pw_mf078_f117, pd_zero); \ + z4l = vec_msums(z34l, pw_f117_f078, pd_zero); \ + z4h = vec_msums(z34h, pw_f117_f078, pd_zero); \ + \ + /* (Original) \ + * z1 = tmp0 + tmp3; z2 = tmp1 + tmp2; \ + * tmp0 = tmp0 * 0.298631336; tmp1 = tmp1 * 2.053119869; \ + * tmp2 = tmp2 * 3.072711026; tmp3 = tmp3 * 1.501321110; \ + * z1 = z1 * -0.899976223; z2 = z2 * -2.562915447; \ + * tmp0 += z1 + z3; tmp1 += z2 + z4; \ + * tmp2 += z2 + z3; tmp3 += z1 + z4; \ + * \ + * (This implementation) \ + * tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223; \ + * tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447; \ + * tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447); \ + * tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223); \ + * tmp0 += z3; tmp1 += z4; \ + * tmp2 += z3; tmp3 += z4; \ + */ \ + \ + in##71l = vec_mergeh(in##7, in##1); \ + in##71h = vec_mergel(in##7, in##1); \ + \ + tmp0l = vec_msums(in##71l, pw_mf060_mf089, z3l); \ + tmp0h = vec_msums(in##71h, pw_mf060_mf089, z3h); \ + tmp3l = vec_msums(in##71l, pw_mf089_f060, z4l); \ + tmp3h = vec_msums(in##71h, pw_mf089_f060, z4h); \ + \ + in##53l = vec_mergeh(in##5, in##3); \ + in##53h = vec_mergel(in##5, in##3); \ + \ + tmp1l = vec_msums(in##53l, pw_mf050_mf256, z4l); \ + tmp1h = vec_msums(in##53h, pw_mf050_mf256, z4h); \ + tmp2l = vec_msums(in##53l, pw_mf256_f050, z3l); \ + tmp2h = vec_msums(in##53h, pw_mf256_f050, z3h); \ + \ + /* Final output stage */ \ + \ + out0l = vec_add(tmp10l, tmp3l); \ + out0h = vec_add(tmp10h, tmp3h); \ + out7l = vec_sub(tmp10l, tmp3l); \ + out7h = vec_sub(tmp10h, tmp3h); \ + \ + out0l = vec_sra(out0l, descale_p##PASS); \ + out0h = vec_sra(out0h, descale_p##PASS); \ + out7l = vec_sra(out7l, descale_p##PASS); \ + out7h = vec_sra(out7h, descale_p##PASS); \ + \ + out0 = vec_pack(out0l, out0h); \ + out7 = vec_pack(out7l, out7h); \ + \ + out1l = vec_add(tmp11l, tmp2l); \ + out1h = vec_add(tmp11h, tmp2h); \ + out6l = vec_sub(tmp11l, tmp2l); \ + out6h = vec_sub(tmp11h, tmp2h); \ + \ + out1l = vec_sra(out1l, descale_p##PASS); \ + out1h = vec_sra(out1h, descale_p##PASS); \ + out6l = vec_sra(out6l, descale_p##PASS); \ + out6h = vec_sra(out6h, descale_p##PASS); \ + \ + out1 = vec_pack(out1l, out1h); \ + out6 = vec_pack(out6l, out6h); \ + \ + out2l = vec_add(tmp12l, tmp1l); \ + out2h = vec_add(tmp12h, tmp1h); \ + out5l = vec_sub(tmp12l, tmp1l); \ + out5h = vec_sub(tmp12h, tmp1h); \ + \ + out2l = vec_sra(out2l, descale_p##PASS); \ + out2h = vec_sra(out2h, descale_p##PASS); \ + out5l = vec_sra(out5l, descale_p##PASS); \ + out5h = vec_sra(out5h, descale_p##PASS); \ + \ + out2 = vec_pack(out2l, out2h); \ + out5 = vec_pack(out5l, out5h); \ + \ + out3l = vec_add(tmp13l, tmp0l); \ + out3h = vec_add(tmp13h, tmp0h); \ + out4l = vec_sub(tmp13l, tmp0l); \ + out4h = vec_sub(tmp13h, tmp0h); \ + \ + out3l = vec_sra(out3l, descale_p##PASS); \ + out3h = vec_sra(out3h, descale_p##PASS); \ + out4l = vec_sra(out4l, descale_p##PASS); \ + out4h = vec_sra(out4h, descale_p##PASS); \ + \ + out3 = vec_pack(out3l, out3h); \ + out4 = vec_pack(out4l, out4h); \ +} + + +void +jsimd_idct_islow_altivec (void *dct_table_, JCOEFPTR coef_block, + JSAMPARRAY output_buf, JDIMENSION output_col) +{ + short *dct_table = (short *)dct_table_; + int *outptr; + + __vector short row0, row1, row2, row3, row4, row5, row6, row7, + col0, col1, col2, col3, col4, col5, col6, col7, + quant0, quant1, quant2, quant3, quant4, quant5, quant6, quant7, + tmp0, tmp1, tmp2, tmp3, z3, z4, + z34l, z34h, col71l, col71h, col26l, col26h, col53l, col53h, + row71l, row71h, row26l, row26h, row53l, row53h, + out0, out1, out2, out3, out4, out5, out6, out7; + __vector int tmp0l, tmp0h, tmp1l, tmp1h, tmp2l, tmp2h, tmp3l, tmp3h, + tmp10l, tmp10h, tmp11l, tmp11h, tmp12l, tmp12h, tmp13l, tmp13h, + z3l, z3h, z4l, z4h, + out0l, out0h, out1l, out1h, out2l, out2h, out3l, out3h, out4l, out4h, + out5l, out5h, out6l, out6h, out7l, out7h; + __vector signed char outb; + + /* Constants */ + __vector short pw_zero = { __8X(0) }, + pw_f130_f054 = { __4X2(F_0_541 + F_0_765, F_0_541) }, + pw_f054_mf130 = { __4X2(F_0_541, F_0_541 - F_1_847) }, + pw_mf078_f117 = { __4X2(F_1_175 - F_1_961, F_1_175) }, + pw_f117_f078 = { __4X2(F_1_175, F_1_175 - F_0_390) }, + pw_mf060_mf089 = { __4X2(F_0_298 - F_0_899, -F_0_899) }, + pw_mf089_f060 = { __4X2(-F_0_899, F_1_501 - F_0_899) }, + pw_mf050_mf256 = { __4X2(F_2_053 - F_2_562, -F_2_562) }, + pw_mf256_f050 = { __4X2(-F_2_562, F_3_072 - F_2_562) }; + __vector unsigned short pass1_bits = { __8X(PASS1_BITS) }; + __vector int pd_zero = { __4X(0) }, + pd_descale_p1 = { __4X(1 << (DESCALE_P1 - 1)) }, + pd_descale_p2 = { __4X(1 << (DESCALE_P2 - 1)) }; + __vector unsigned int descale_p1 = { __4X(DESCALE_P1) }, + descale_p2 = { __4X(DESCALE_P2) }, + const_bits = { __4X(CONST_BITS) }; + __vector signed char pb_centerjsamp = { __16X(CENTERJSAMPLE) }; + + /* Pass 1: process columns */ + + col0 = vec_ld(0, coef_block); + col1 = vec_ld(16, coef_block); + col2 = vec_ld(32, coef_block); + col3 = vec_ld(48, coef_block); + col4 = vec_ld(64, coef_block); + col5 = vec_ld(80, coef_block); + col6 = vec_ld(96, coef_block); + col7 = vec_ld(112, coef_block); + + tmp1 = vec_or(col1, col2); + tmp2 = vec_or(col3, col4); + tmp1 = vec_or(tmp1, tmp2); + tmp3 = vec_or(col5, col6); + tmp3 = vec_or(tmp3, col7); + tmp1 = vec_or(tmp1, tmp3); + + quant0 = vec_ld(0, dct_table); + col0 = vec_mladd(col0, quant0, pw_zero); + + if (vec_all_eq(tmp1, pw_zero)) { + /* AC terms all zero */ + + col0 = vec_sl(col0, pass1_bits); + + row0 = vec_splat(col0, 0); + row1 = vec_splat(col0, 1); + row2 = vec_splat(col0, 2); + row3 = vec_splat(col0, 3); + row4 = vec_splat(col0, 4); + row5 = vec_splat(col0, 5); + row6 = vec_splat(col0, 6); + row7 = vec_splat(col0, 7); + + } else { + + quant1 = vec_ld(16, dct_table); + quant2 = vec_ld(32, dct_table); + quant3 = vec_ld(48, dct_table); + quant4 = vec_ld(64, dct_table); + quant5 = vec_ld(80, dct_table); + quant6 = vec_ld(96, dct_table); + quant7 = vec_ld(112, dct_table); + + col1 = vec_mladd(col1, quant1, pw_zero); + col2 = vec_mladd(col2, quant2, pw_zero); + col3 = vec_mladd(col3, quant3, pw_zero); + col4 = vec_mladd(col4, quant4, pw_zero); + col5 = vec_mladd(col5, quant5, pw_zero); + col6 = vec_mladd(col6, quant6, pw_zero); + col7 = vec_mladd(col7, quant7, pw_zero); + + DO_IDCT(col, 1); + + TRANSPOSE(out, row); + } + + /* Pass 2: process rows */ + + DO_IDCT(row, 2); + + TRANSPOSE(out, col); + + outb = vec_packs(col0, col0); + outb = vec_add(outb, pb_centerjsamp); + outptr = (int *)(output_buf[0] + output_col); + vec_ste((__vector int)outb, 0, outptr); + vec_ste((__vector int)outb, 4, outptr); + + outb = vec_packs(col1, col1); + outb = vec_add(outb, pb_centerjsamp); + outptr = (int *)(output_buf[1] + output_col); + vec_ste((__vector int)outb, 0, outptr); + vec_ste((__vector int)outb, 4, outptr); + + outb = vec_packs(col2, col2); + outb = vec_add(outb, pb_centerjsamp); + outptr = (int *)(output_buf[2] + output_col); + vec_ste((__vector int)outb, 0, outptr); + vec_ste((__vector int)outb, 4, outptr); + + outb = vec_packs(col3, col3); + outb = vec_add(outb, pb_centerjsamp); + outptr = (int *)(output_buf[3] + output_col); + vec_ste((__vector int)outb, 0, outptr); + vec_ste((__vector int)outb, 4, outptr); + + outb = vec_packs(col4, col4); + outb = vec_add(outb, pb_centerjsamp); + outptr = (int *)(output_buf[4] + output_col); + vec_ste((__vector int)outb, 0, outptr); + vec_ste((__vector int)outb, 4, outptr); + + outb = vec_packs(col5, col5); + outb = vec_add(outb, pb_centerjsamp); + outptr = (int *)(output_buf[5] + output_col); + vec_ste((__vector int)outb, 0, outptr); + vec_ste((__vector int)outb, 4, outptr); + + outb = vec_packs(col6, col6); + outb = vec_add(outb, pb_centerjsamp); + outptr = (int *)(output_buf[6] + output_col); + vec_ste((__vector int)outb, 0, outptr); + vec_ste((__vector int)outb, 4, outptr); + + outb = vec_packs(col7, col7); + outb = vec_add(outb, pb_centerjsamp); + outptr = (int *)(output_buf[7] + output_col); + vec_ste((__vector int)outb, 0, outptr); + vec_ste((__vector int)outb, 4, outptr); +} diff --git a/media/libjpeg/simd/jidctint-mmx.asm b/media/libjpeg/simd/jidctint-mmx.asm new file mode 100644 index 000000000..5bd198120 --- /dev/null +++ b/media/libjpeg/simd/jidctint-mmx.asm @@ -0,0 +1,851 @@ +; +; jidctint.asm - accurate integer IDCT (MMX) +; +; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB +; +; Based on the x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler). +; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 +; +; This file contains a slow-but-accurate integer implementation of the +; inverse DCT (Discrete Cosine Transform). The following code is based +; directly on the IJG's original jidctint.c; see the jidctint.c for +; more details. +; +; [TAB8] + +%include "jsimdext.inc" +%include "jdct.inc" + +; -------------------------------------------------------------------------- + +%define CONST_BITS 13 +%define PASS1_BITS 2 + +%define DESCALE_P1 (CONST_BITS-PASS1_BITS) +%define DESCALE_P2 (CONST_BITS+PASS1_BITS+3) + +%if CONST_BITS == 13 +F_0_298 equ 2446 ; FIX(0.298631336) +F_0_390 equ 3196 ; FIX(0.390180644) +F_0_541 equ 4433 ; FIX(0.541196100) +F_0_765 equ 6270 ; FIX(0.765366865) +F_0_899 equ 7373 ; FIX(0.899976223) +F_1_175 equ 9633 ; FIX(1.175875602) +F_1_501 equ 12299 ; FIX(1.501321110) +F_1_847 equ 15137 ; FIX(1.847759065) +F_1_961 equ 16069 ; FIX(1.961570560) +F_2_053 equ 16819 ; FIX(2.053119869) +F_2_562 equ 20995 ; FIX(2.562915447) +F_3_072 equ 25172 ; FIX(3.072711026) +%else +; NASM cannot do compile-time arithmetic on floating-point constants. +%define DESCALE(x,n) (((x)+(1<<((n)-1)))>>(n)) +F_0_298 equ DESCALE( 320652955,30-CONST_BITS) ; FIX(0.298631336) +F_0_390 equ DESCALE( 418953276,30-CONST_BITS) ; FIX(0.390180644) +F_0_541 equ DESCALE( 581104887,30-CONST_BITS) ; FIX(0.541196100) +F_0_765 equ DESCALE( 821806413,30-CONST_BITS) ; FIX(0.765366865) +F_0_899 equ DESCALE( 966342111,30-CONST_BITS) ; FIX(0.899976223) +F_1_175 equ DESCALE(1262586813,30-CONST_BITS) ; FIX(1.175875602) +F_1_501 equ DESCALE(1612031267,30-CONST_BITS) ; FIX(1.501321110) +F_1_847 equ DESCALE(1984016188,30-CONST_BITS) ; FIX(1.847759065) +F_1_961 equ DESCALE(2106220350,30-CONST_BITS) ; FIX(1.961570560) +F_2_053 equ DESCALE(2204520673,30-CONST_BITS) ; FIX(2.053119869) +F_2_562 equ DESCALE(2751909506,30-CONST_BITS) ; FIX(2.562915447) +F_3_072 equ DESCALE(3299298341,30-CONST_BITS) ; FIX(3.072711026) +%endif + +; -------------------------------------------------------------------------- + SECTION SEG_CONST + + alignz 16 + global EXTN(jconst_idct_islow_mmx) + +EXTN(jconst_idct_islow_mmx): + +PW_F130_F054 times 2 dw (F_0_541+F_0_765), F_0_541 +PW_F054_MF130 times 2 dw F_0_541, (F_0_541-F_1_847) +PW_MF078_F117 times 2 dw (F_1_175-F_1_961), F_1_175 +PW_F117_F078 times 2 dw F_1_175, (F_1_175-F_0_390) +PW_MF060_MF089 times 2 dw (F_0_298-F_0_899),-F_0_899 +PW_MF089_F060 times 2 dw -F_0_899, (F_1_501-F_0_899) +PW_MF050_MF256 times 2 dw (F_2_053-F_2_562),-F_2_562 +PW_MF256_F050 times 2 dw -F_2_562, (F_3_072-F_2_562) +PD_DESCALE_P1 times 2 dd 1 << (DESCALE_P1-1) +PD_DESCALE_P2 times 2 dd 1 << (DESCALE_P2-1) +PB_CENTERJSAMP times 8 db CENTERJSAMPLE + + alignz 16 + +; -------------------------------------------------------------------------- + SECTION SEG_TEXT + BITS 32 +; +; Perform dequantization and inverse DCT on one block of coefficients. +; +; GLOBAL(void) +; jsimd_idct_islow_mmx (void *dct_table, JCOEFPTR coef_block, +; JSAMPARRAY output_buf, JDIMENSION output_col) +; + +%define dct_table(b) (b)+8 ; jpeg_component_info *compptr +%define coef_block(b) (b)+12 ; JCOEFPTR coef_block +%define output_buf(b) (b)+16 ; JSAMPARRAY output_buf +%define output_col(b) (b)+20 ; JDIMENSION output_col + +%define original_ebp ebp+0 +%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_MMWORD ; mmword wk[WK_NUM] +%define WK_NUM 12 +%define workspace wk(0)-DCTSIZE2*SIZEOF_JCOEF + ; JCOEF workspace[DCTSIZE2] + + align 16 + global EXTN(jsimd_idct_islow_mmx) + +EXTN(jsimd_idct_islow_mmx): + push ebp + mov eax,esp ; eax = original ebp + sub esp, byte 4 + and esp, byte (-SIZEOF_MMWORD) ; align to 64 bits + mov [esp],eax + mov ebp,esp ; ebp = aligned ebp + lea esp, [workspace] + push ebx +; push ecx ; need not be preserved +; push edx ; need not be preserved + push esi + push edi + + get_GOT ebx ; get GOT address + + ; ---- Pass 1: process columns from input, store into work array. + +; mov eax, [original_ebp] + mov edx, POINTER [dct_table(eax)] ; quantptr + mov esi, JCOEFPTR [coef_block(eax)] ; inptr + lea edi, [workspace] ; JCOEF *wsptr + mov ecx, DCTSIZE/4 ; ctr + alignx 16,7 +.columnloop: +%ifndef NO_ZERO_COLUMN_TEST_ISLOW_MMX + mov eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)] + or eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)] + jnz short .columnDCT + + movq mm0, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)] + movq mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)] + por mm0, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)] + por mm1, MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)] + por mm0, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)] + por mm1, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)] + por mm0, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)] + por mm1,mm0 + packsswb mm1,mm1 + movd eax,mm1 + test eax,eax + jnz short .columnDCT + + ; -- AC terms all zero + + movq mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)] + pmullw mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)] + + psllw mm0,PASS1_BITS + + movq mm2,mm0 ; mm0=in0=(00 01 02 03) + punpcklwd mm0,mm0 ; mm0=(00 00 01 01) + punpckhwd mm2,mm2 ; mm2=(02 02 03 03) + + movq mm1,mm0 + punpckldq mm0,mm0 ; mm0=(00 00 00 00) + punpckhdq mm1,mm1 ; mm1=(01 01 01 01) + movq mm3,mm2 + punpckldq mm2,mm2 ; mm2=(02 02 02 02) + punpckhdq mm3,mm3 ; mm3=(03 03 03 03) + + movq MMWORD [MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm0 + movq MMWORD [MMBLOCK(0,1,edi,SIZEOF_JCOEF)], mm0 + movq MMWORD [MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm1 + movq MMWORD [MMBLOCK(1,1,edi,SIZEOF_JCOEF)], mm1 + movq MMWORD [MMBLOCK(2,0,edi,SIZEOF_JCOEF)], mm2 + movq MMWORD [MMBLOCK(2,1,edi,SIZEOF_JCOEF)], mm2 + movq MMWORD [MMBLOCK(3,0,edi,SIZEOF_JCOEF)], mm3 + movq MMWORD [MMBLOCK(3,1,edi,SIZEOF_JCOEF)], mm3 + jmp near .nextcolumn + alignx 16,7 +%endif +.columnDCT: + + ; -- Even part + + movq mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)] + movq mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)] + pmullw mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)] + pmullw mm1, MMWORD [MMBLOCK(2,0,edx,SIZEOF_ISLOW_MULT_TYPE)] + movq mm2, MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)] + movq mm3, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)] + pmullw mm2, MMWORD [MMBLOCK(4,0,edx,SIZEOF_ISLOW_MULT_TYPE)] + pmullw mm3, MMWORD [MMBLOCK(6,0,edx,SIZEOF_ISLOW_MULT_TYPE)] + + ; (Original) + ; z1 = (z2 + z3) * 0.541196100; + ; tmp2 = z1 + z3 * -1.847759065; + ; tmp3 = z1 + z2 * 0.765366865; + ; + ; (This implementation) + ; tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065); + ; tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100; + + movq mm4,mm1 ; mm1=in2=z2 + movq mm5,mm1 + punpcklwd mm4,mm3 ; mm3=in6=z3 + punpckhwd mm5,mm3 + movq mm1,mm4 + movq mm3,mm5 + pmaddwd mm4,[GOTOFF(ebx,PW_F130_F054)] ; mm4=tmp3L + pmaddwd mm5,[GOTOFF(ebx,PW_F130_F054)] ; mm5=tmp3H + pmaddwd mm1,[GOTOFF(ebx,PW_F054_MF130)] ; mm1=tmp2L + pmaddwd mm3,[GOTOFF(ebx,PW_F054_MF130)] ; mm3=tmp2H + + movq mm6,mm0 + paddw mm0,mm2 ; mm0=in0+in4 + psubw mm6,mm2 ; mm6=in0-in4 + + pxor mm7,mm7 + pxor mm2,mm2 + punpcklwd mm7,mm0 ; mm7=tmp0L + punpckhwd mm2,mm0 ; mm2=tmp0H + psrad mm7,(16-CONST_BITS) ; psrad mm7,16 & pslld mm7,CONST_BITS + psrad mm2,(16-CONST_BITS) ; psrad mm2,16 & pslld mm2,CONST_BITS + + movq mm0,mm7 + paddd mm7,mm4 ; mm7=tmp10L + psubd mm0,mm4 ; mm0=tmp13L + movq mm4,mm2 + paddd mm2,mm5 ; mm2=tmp10H + psubd mm4,mm5 ; mm4=tmp13H + + movq MMWORD [wk(0)], mm7 ; wk(0)=tmp10L + movq MMWORD [wk(1)], mm2 ; wk(1)=tmp10H + movq MMWORD [wk(2)], mm0 ; wk(2)=tmp13L + movq MMWORD [wk(3)], mm4 ; wk(3)=tmp13H + + pxor mm5,mm5 + pxor mm7,mm7 + punpcklwd mm5,mm6 ; mm5=tmp1L + punpckhwd mm7,mm6 ; mm7=tmp1H + psrad mm5,(16-CONST_BITS) ; psrad mm5,16 & pslld mm5,CONST_BITS + psrad mm7,(16-CONST_BITS) ; psrad mm7,16 & pslld mm7,CONST_BITS + + movq mm2,mm5 + paddd mm5,mm1 ; mm5=tmp11L + psubd mm2,mm1 ; mm2=tmp12L + movq mm0,mm7 + paddd mm7,mm3 ; mm7=tmp11H + psubd mm0,mm3 ; mm0=tmp12H + + movq MMWORD [wk(4)], mm5 ; wk(4)=tmp11L + movq MMWORD [wk(5)], mm7 ; wk(5)=tmp11H + movq MMWORD [wk(6)], mm2 ; wk(6)=tmp12L + movq MMWORD [wk(7)], mm0 ; wk(7)=tmp12H + + ; -- Odd part + + movq mm4, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)] + movq mm6, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)] + pmullw mm4, MMWORD [MMBLOCK(1,0,edx,SIZEOF_ISLOW_MULT_TYPE)] + pmullw mm6, MMWORD [MMBLOCK(3,0,edx,SIZEOF_ISLOW_MULT_TYPE)] + movq mm1, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)] + movq mm3, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)] + pmullw mm1, MMWORD [MMBLOCK(5,0,edx,SIZEOF_ISLOW_MULT_TYPE)] + pmullw mm3, MMWORD [MMBLOCK(7,0,edx,SIZEOF_ISLOW_MULT_TYPE)] + + movq mm5,mm6 + movq mm7,mm4 + paddw mm5,mm3 ; mm5=z3 + paddw mm7,mm1 ; mm7=z4 + + ; (Original) + ; z5 = (z3 + z4) * 1.175875602; + ; z3 = z3 * -1.961570560; z4 = z4 * -0.390180644; + ; z3 += z5; z4 += z5; + ; + ; (This implementation) + ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602; + ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644); + + movq mm2,mm5 + movq mm0,mm5 + punpcklwd mm2,mm7 + punpckhwd mm0,mm7 + movq mm5,mm2 + movq mm7,mm0 + pmaddwd mm2,[GOTOFF(ebx,PW_MF078_F117)] ; mm2=z3L + pmaddwd mm0,[GOTOFF(ebx,PW_MF078_F117)] ; mm0=z3H + pmaddwd mm5,[GOTOFF(ebx,PW_F117_F078)] ; mm5=z4L + pmaddwd mm7,[GOTOFF(ebx,PW_F117_F078)] ; mm7=z4H + + movq MMWORD [wk(10)], mm2 ; wk(10)=z3L + movq MMWORD [wk(11)], mm0 ; wk(11)=z3H + + ; (Original) + ; z1 = tmp0 + tmp3; z2 = tmp1 + tmp2; + ; tmp0 = tmp0 * 0.298631336; tmp1 = tmp1 * 2.053119869; + ; tmp2 = tmp2 * 3.072711026; tmp3 = tmp3 * 1.501321110; + ; z1 = z1 * -0.899976223; z2 = z2 * -2.562915447; + ; tmp0 += z1 + z3; tmp1 += z2 + z4; + ; tmp2 += z2 + z3; tmp3 += z1 + z4; + ; + ; (This implementation) + ; tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223; + ; tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447; + ; tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447); + ; tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223); + ; tmp0 += z3; tmp1 += z4; + ; tmp2 += z3; tmp3 += z4; + + movq mm2,mm3 + movq mm0,mm3 + punpcklwd mm2,mm4 + punpckhwd mm0,mm4 + movq mm3,mm2 + movq mm4,mm0 + pmaddwd mm2,[GOTOFF(ebx,PW_MF060_MF089)] ; mm2=tmp0L + pmaddwd mm0,[GOTOFF(ebx,PW_MF060_MF089)] ; mm0=tmp0H + pmaddwd mm3,[GOTOFF(ebx,PW_MF089_F060)] ; mm3=tmp3L + pmaddwd mm4,[GOTOFF(ebx,PW_MF089_F060)] ; mm4=tmp3H + + paddd mm2, MMWORD [wk(10)] ; mm2=tmp0L + paddd mm0, MMWORD [wk(11)] ; mm0=tmp0H + paddd mm3,mm5 ; mm3=tmp3L + paddd mm4,mm7 ; mm4=tmp3H + + movq MMWORD [wk(8)], mm2 ; wk(8)=tmp0L + movq MMWORD [wk(9)], mm0 ; wk(9)=tmp0H + + movq mm2,mm1 + movq mm0,mm1 + punpcklwd mm2,mm6 + punpckhwd mm0,mm6 + movq mm1,mm2 + movq mm6,mm0 + pmaddwd mm2,[GOTOFF(ebx,PW_MF050_MF256)] ; mm2=tmp1L + pmaddwd mm0,[GOTOFF(ebx,PW_MF050_MF256)] ; mm0=tmp1H + pmaddwd mm1,[GOTOFF(ebx,PW_MF256_F050)] ; mm1=tmp2L + pmaddwd mm6,[GOTOFF(ebx,PW_MF256_F050)] ; mm6=tmp2H + + paddd mm2,mm5 ; mm2=tmp1L + paddd mm0,mm7 ; mm0=tmp1H + paddd mm1, MMWORD [wk(10)] ; mm1=tmp2L + paddd mm6, MMWORD [wk(11)] ; mm6=tmp2H + + movq MMWORD [wk(10)], mm2 ; wk(10)=tmp1L + movq MMWORD [wk(11)], mm0 ; wk(11)=tmp1H + + ; -- Final output stage + + movq mm5, MMWORD [wk(0)] ; mm5=tmp10L + movq mm7, MMWORD [wk(1)] ; mm7=tmp10H + + movq mm2,mm5 + movq mm0,mm7 + paddd mm5,mm3 ; mm5=data0L + paddd mm7,mm4 ; mm7=data0H + psubd mm2,mm3 ; mm2=data7L + psubd mm0,mm4 ; mm0=data7H + + movq mm3,[GOTOFF(ebx,PD_DESCALE_P1)] ; mm3=[PD_DESCALE_P1] + + paddd mm5,mm3 + paddd mm7,mm3 + psrad mm5,DESCALE_P1 + psrad mm7,DESCALE_P1 + paddd mm2,mm3 + paddd mm0,mm3 + psrad mm2,DESCALE_P1 + psrad mm0,DESCALE_P1 + + packssdw mm5,mm7 ; mm5=data0=(00 01 02 03) + packssdw mm2,mm0 ; mm2=data7=(70 71 72 73) + + movq mm4, MMWORD [wk(4)] ; mm4=tmp11L + movq mm3, MMWORD [wk(5)] ; mm3=tmp11H + + movq mm7,mm4 + movq mm0,mm3 + paddd mm4,mm1 ; mm4=data1L + paddd mm3,mm6 ; mm3=data1H + psubd mm7,mm1 ; mm7=data6L + psubd mm0,mm6 ; mm0=data6H + + movq mm1,[GOTOFF(ebx,PD_DESCALE_P1)] ; mm1=[PD_DESCALE_P1] + + paddd mm4,mm1 + paddd mm3,mm1 + psrad mm4,DESCALE_P1 + psrad mm3,DESCALE_P1 + paddd mm7,mm1 + paddd mm0,mm1 + psrad mm7,DESCALE_P1 + psrad mm0,DESCALE_P1 + + packssdw mm4,mm3 ; mm4=data1=(10 11 12 13) + packssdw mm7,mm0 ; mm7=data6=(60 61 62 63) + + movq mm6,mm5 ; transpose coefficients(phase 1) + punpcklwd mm5,mm4 ; mm5=(00 10 01 11) + punpckhwd mm6,mm4 ; mm6=(02 12 03 13) + movq mm1,mm7 ; transpose coefficients(phase 1) + punpcklwd mm7,mm2 ; mm7=(60 70 61 71) + punpckhwd mm1,mm2 ; mm1=(62 72 63 73) + + movq mm3, MMWORD [wk(6)] ; mm3=tmp12L + movq mm0, MMWORD [wk(7)] ; mm0=tmp12H + movq mm4, MMWORD [wk(10)] ; mm4=tmp1L + movq mm2, MMWORD [wk(11)] ; mm2=tmp1H + + movq MMWORD [wk(0)], mm5 ; wk(0)=(00 10 01 11) + movq MMWORD [wk(1)], mm6 ; wk(1)=(02 12 03 13) + movq MMWORD [wk(4)], mm7 ; wk(4)=(60 70 61 71) + movq MMWORD [wk(5)], mm1 ; wk(5)=(62 72 63 73) + + movq mm5,mm3 + movq mm6,mm0 + paddd mm3,mm4 ; mm3=data2L + paddd mm0,mm2 ; mm0=data2H + psubd mm5,mm4 ; mm5=data5L + psubd mm6,mm2 ; mm6=data5H + + movq mm7,[GOTOFF(ebx,PD_DESCALE_P1)] ; mm7=[PD_DESCALE_P1] + + paddd mm3,mm7 + paddd mm0,mm7 + psrad mm3,DESCALE_P1 + psrad mm0,DESCALE_P1 + paddd mm5,mm7 + paddd mm6,mm7 + psrad mm5,DESCALE_P1 + psrad mm6,DESCALE_P1 + + packssdw mm3,mm0 ; mm3=data2=(20 21 22 23) + packssdw mm5,mm6 ; mm5=data5=(50 51 52 53) + + movq mm1, MMWORD [wk(2)] ; mm1=tmp13L + movq mm4, MMWORD [wk(3)] ; mm4=tmp13H + movq mm2, MMWORD [wk(8)] ; mm2=tmp0L + movq mm7, MMWORD [wk(9)] ; mm7=tmp0H + + movq mm0,mm1 + movq mm6,mm4 + paddd mm1,mm2 ; mm1=data3L + paddd mm4,mm7 ; mm4=data3H + psubd mm0,mm2 ; mm0=data4L + psubd mm6,mm7 ; mm6=data4H + + movq mm2,[GOTOFF(ebx,PD_DESCALE_P1)] ; mm2=[PD_DESCALE_P1] + + paddd mm1,mm2 + paddd mm4,mm2 + psrad mm1,DESCALE_P1 + psrad mm4,DESCALE_P1 + paddd mm0,mm2 + paddd mm6,mm2 + psrad mm0,DESCALE_P1 + psrad mm6,DESCALE_P1 + + packssdw mm1,mm4 ; mm1=data3=(30 31 32 33) + packssdw mm0,mm6 ; mm0=data4=(40 41 42 43) + + movq mm7, MMWORD [wk(0)] ; mm7=(00 10 01 11) + movq mm2, MMWORD [wk(1)] ; mm2=(02 12 03 13) + + movq mm4,mm3 ; transpose coefficients(phase 1) + punpcklwd mm3,mm1 ; mm3=(20 30 21 31) + punpckhwd mm4,mm1 ; mm4=(22 32 23 33) + movq mm6,mm0 ; transpose coefficients(phase 1) + punpcklwd mm0,mm5 ; mm0=(40 50 41 51) + punpckhwd mm6,mm5 ; mm6=(42 52 43 53) + + movq mm1,mm7 ; transpose coefficients(phase 2) + punpckldq mm7,mm3 ; mm7=(00 10 20 30) + punpckhdq mm1,mm3 ; mm1=(01 11 21 31) + movq mm5,mm2 ; transpose coefficients(phase 2) + punpckldq mm2,mm4 ; mm2=(02 12 22 32) + punpckhdq mm5,mm4 ; mm5=(03 13 23 33) + + movq mm3, MMWORD [wk(4)] ; mm3=(60 70 61 71) + movq mm4, MMWORD [wk(5)] ; mm4=(62 72 63 73) + + movq MMWORD [MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm7 + movq MMWORD [MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm1 + movq MMWORD [MMBLOCK(2,0,edi,SIZEOF_JCOEF)], mm2 + movq MMWORD [MMBLOCK(3,0,edi,SIZEOF_JCOEF)], mm5 + + movq mm7,mm0 ; transpose coefficients(phase 2) + punpckldq mm0,mm3 ; mm0=(40 50 60 70) + punpckhdq mm7,mm3 ; mm7=(41 51 61 71) + movq mm1,mm6 ; transpose coefficients(phase 2) + punpckldq mm6,mm4 ; mm6=(42 52 62 72) + punpckhdq mm1,mm4 ; mm1=(43 53 63 73) + + movq MMWORD [MMBLOCK(0,1,edi,SIZEOF_JCOEF)], mm0 + movq MMWORD [MMBLOCK(1,1,edi,SIZEOF_JCOEF)], mm7 + movq MMWORD [MMBLOCK(2,1,edi,SIZEOF_JCOEF)], mm6 + movq MMWORD [MMBLOCK(3,1,edi,SIZEOF_JCOEF)], mm1 + +.nextcolumn: + add esi, byte 4*SIZEOF_JCOEF ; coef_block + add edx, byte 4*SIZEOF_ISLOW_MULT_TYPE ; quantptr + add edi, byte 4*DCTSIZE*SIZEOF_JCOEF ; wsptr + dec ecx ; ctr + jnz near .columnloop + + ; ---- Pass 2: process rows from work array, store into output array. + + mov eax, [original_ebp] + lea esi, [workspace] ; JCOEF *wsptr + mov edi, JSAMPARRAY [output_buf(eax)] ; (JSAMPROW *) + mov eax, JDIMENSION [output_col(eax)] + mov ecx, DCTSIZE/4 ; ctr + alignx 16,7 +.rowloop: + + ; -- Even part + + movq mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)] + movq mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)] + movq mm2, MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)] + movq mm3, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)] + + ; (Original) + ; z1 = (z2 + z3) * 0.541196100; + ; tmp2 = z1 + z3 * -1.847759065; + ; tmp3 = z1 + z2 * 0.765366865; + ; + ; (This implementation) + ; tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065); + ; tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100; + + movq mm4,mm1 ; mm1=in2=z2 + movq mm5,mm1 + punpcklwd mm4,mm3 ; mm3=in6=z3 + punpckhwd mm5,mm3 + movq mm1,mm4 + movq mm3,mm5 + pmaddwd mm4,[GOTOFF(ebx,PW_F130_F054)] ; mm4=tmp3L + pmaddwd mm5,[GOTOFF(ebx,PW_F130_F054)] ; mm5=tmp3H + pmaddwd mm1,[GOTOFF(ebx,PW_F054_MF130)] ; mm1=tmp2L + pmaddwd mm3,[GOTOFF(ebx,PW_F054_MF130)] ; mm3=tmp2H + + movq mm6,mm0 + paddw mm0,mm2 ; mm0=in0+in4 + psubw mm6,mm2 ; mm6=in0-in4 + + pxor mm7,mm7 + pxor mm2,mm2 + punpcklwd mm7,mm0 ; mm7=tmp0L + punpckhwd mm2,mm0 ; mm2=tmp0H + psrad mm7,(16-CONST_BITS) ; psrad mm7,16 & pslld mm7,CONST_BITS + psrad mm2,(16-CONST_BITS) ; psrad mm2,16 & pslld mm2,CONST_BITS + + movq mm0,mm7 + paddd mm7,mm4 ; mm7=tmp10L + psubd mm0,mm4 ; mm0=tmp13L + movq mm4,mm2 + paddd mm2,mm5 ; mm2=tmp10H + psubd mm4,mm5 ; mm4=tmp13H + + movq MMWORD [wk(0)], mm7 ; wk(0)=tmp10L + movq MMWORD [wk(1)], mm2 ; wk(1)=tmp10H + movq MMWORD [wk(2)], mm0 ; wk(2)=tmp13L + movq MMWORD [wk(3)], mm4 ; wk(3)=tmp13H + + pxor mm5,mm5 + pxor mm7,mm7 + punpcklwd mm5,mm6 ; mm5=tmp1L + punpckhwd mm7,mm6 ; mm7=tmp1H + psrad mm5,(16-CONST_BITS) ; psrad mm5,16 & pslld mm5,CONST_BITS + psrad mm7,(16-CONST_BITS) ; psrad mm7,16 & pslld mm7,CONST_BITS + + movq mm2,mm5 + paddd mm5,mm1 ; mm5=tmp11L + psubd mm2,mm1 ; mm2=tmp12L + movq mm0,mm7 + paddd mm7,mm3 ; mm7=tmp11H + psubd mm0,mm3 ; mm0=tmp12H + + movq MMWORD [wk(4)], mm5 ; wk(4)=tmp11L + movq MMWORD [wk(5)], mm7 ; wk(5)=tmp11H + movq MMWORD [wk(6)], mm2 ; wk(6)=tmp12L + movq MMWORD [wk(7)], mm0 ; wk(7)=tmp12H + + ; -- Odd part + + movq mm4, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)] + movq mm6, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)] + movq mm1, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)] + movq mm3, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)] + + movq mm5,mm6 + movq mm7,mm4 + paddw mm5,mm3 ; mm5=z3 + paddw mm7,mm1 ; mm7=z4 + + ; (Original) + ; z5 = (z3 + z4) * 1.175875602; + ; z3 = z3 * -1.961570560; z4 = z4 * -0.390180644; + ; z3 += z5; z4 += z5; + ; + ; (This implementation) + ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602; + ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644); + + movq mm2,mm5 + movq mm0,mm5 + punpcklwd mm2,mm7 + punpckhwd mm0,mm7 + movq mm5,mm2 + movq mm7,mm0 + pmaddwd mm2,[GOTOFF(ebx,PW_MF078_F117)] ; mm2=z3L + pmaddwd mm0,[GOTOFF(ebx,PW_MF078_F117)] ; mm0=z3H + pmaddwd mm5,[GOTOFF(ebx,PW_F117_F078)] ; mm5=z4L + pmaddwd mm7,[GOTOFF(ebx,PW_F117_F078)] ; mm7=z4H + + movq MMWORD [wk(10)], mm2 ; wk(10)=z3L + movq MMWORD [wk(11)], mm0 ; wk(11)=z3H + + ; (Original) + ; z1 = tmp0 + tmp3; z2 = tmp1 + tmp2; + ; tmp0 = tmp0 * 0.298631336; tmp1 = tmp1 * 2.053119869; + ; tmp2 = tmp2 * 3.072711026; tmp3 = tmp3 * 1.501321110; + ; z1 = z1 * -0.899976223; z2 = z2 * -2.562915447; + ; tmp0 += z1 + z3; tmp1 += z2 + z4; + ; tmp2 += z2 + z3; tmp3 += z1 + z4; + ; + ; (This implementation) + ; tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223; + ; tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447; + ; tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447); + ; tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223); + ; tmp0 += z3; tmp1 += z4; + ; tmp2 += z3; tmp3 += z4; + + movq mm2,mm3 + movq mm0,mm3 + punpcklwd mm2,mm4 + punpckhwd mm0,mm4 + movq mm3,mm2 + movq mm4,mm0 + pmaddwd mm2,[GOTOFF(ebx,PW_MF060_MF089)] ; mm2=tmp0L + pmaddwd mm0,[GOTOFF(ebx,PW_MF060_MF089)] ; mm0=tmp0H + pmaddwd mm3,[GOTOFF(ebx,PW_MF089_F060)] ; mm3=tmp3L + pmaddwd mm4,[GOTOFF(ebx,PW_MF089_F060)] ; mm4=tmp3H + + paddd mm2, MMWORD [wk(10)] ; mm2=tmp0L + paddd mm0, MMWORD [wk(11)] ; mm0=tmp0H + paddd mm3,mm5 ; mm3=tmp3L + paddd mm4,mm7 ; mm4=tmp3H + + movq MMWORD [wk(8)], mm2 ; wk(8)=tmp0L + movq MMWORD [wk(9)], mm0 ; wk(9)=tmp0H + + movq mm2,mm1 + movq mm0,mm1 + punpcklwd mm2,mm6 + punpckhwd mm0,mm6 + movq mm1,mm2 + movq mm6,mm0 + pmaddwd mm2,[GOTOFF(ebx,PW_MF050_MF256)] ; mm2=tmp1L + pmaddwd mm0,[GOTOFF(ebx,PW_MF050_MF256)] ; mm0=tmp1H + pmaddwd mm1,[GOTOFF(ebx,PW_MF256_F050)] ; mm1=tmp2L + pmaddwd mm6,[GOTOFF(ebx,PW_MF256_F050)] ; mm6=tmp2H + + paddd mm2,mm5 ; mm2=tmp1L + paddd mm0,mm7 ; mm0=tmp1H + paddd mm1, MMWORD [wk(10)] ; mm1=tmp2L + paddd mm6, MMWORD [wk(11)] ; mm6=tmp2H + + movq MMWORD [wk(10)], mm2 ; wk(10)=tmp1L + movq MMWORD [wk(11)], mm0 ; wk(11)=tmp1H + + ; -- Final output stage + + movq mm5, MMWORD [wk(0)] ; mm5=tmp10L + movq mm7, MMWORD [wk(1)] ; mm7=tmp10H + + movq mm2,mm5 + movq mm0,mm7 + paddd mm5,mm3 ; mm5=data0L + paddd mm7,mm4 ; mm7=data0H + psubd mm2,mm3 ; mm2=data7L + psubd mm0,mm4 ; mm0=data7H + + movq mm3,[GOTOFF(ebx,PD_DESCALE_P2)] ; mm3=[PD_DESCALE_P2] + + paddd mm5,mm3 + paddd mm7,mm3 + psrad mm5,DESCALE_P2 + psrad mm7,DESCALE_P2 + paddd mm2,mm3 + paddd mm0,mm3 + psrad mm2,DESCALE_P2 + psrad mm0,DESCALE_P2 + + packssdw mm5,mm7 ; mm5=data0=(00 10 20 30) + packssdw mm2,mm0 ; mm2=data7=(07 17 27 37) + + movq mm4, MMWORD [wk(4)] ; mm4=tmp11L + movq mm3, MMWORD [wk(5)] ; mm3=tmp11H + + movq mm7,mm4 + movq mm0,mm3 + paddd mm4,mm1 ; mm4=data1L + paddd mm3,mm6 ; mm3=data1H + psubd mm7,mm1 ; mm7=data6L + psubd mm0,mm6 ; mm0=data6H + + movq mm1,[GOTOFF(ebx,PD_DESCALE_P2)] ; mm1=[PD_DESCALE_P2] + + paddd mm4,mm1 + paddd mm3,mm1 + psrad mm4,DESCALE_P2 + psrad mm3,DESCALE_P2 + paddd mm7,mm1 + paddd mm0,mm1 + psrad mm7,DESCALE_P2 + psrad mm0,DESCALE_P2 + + packssdw mm4,mm3 ; mm4=data1=(01 11 21 31) + packssdw mm7,mm0 ; mm7=data6=(06 16 26 36) + + packsswb mm5,mm7 ; mm5=(00 10 20 30 06 16 26 36) + packsswb mm4,mm2 ; mm4=(01 11 21 31 07 17 27 37) + + movq mm6, MMWORD [wk(6)] ; mm6=tmp12L + movq mm1, MMWORD [wk(7)] ; mm1=tmp12H + movq mm3, MMWORD [wk(10)] ; mm3=tmp1L + movq mm0, MMWORD [wk(11)] ; mm0=tmp1H + + movq MMWORD [wk(0)], mm5 ; wk(0)=(00 10 20 30 06 16 26 36) + movq MMWORD [wk(1)], mm4 ; wk(1)=(01 11 21 31 07 17 27 37) + + movq mm7,mm6 + movq mm2,mm1 + paddd mm6,mm3 ; mm6=data2L + paddd mm1,mm0 ; mm1=data2H + psubd mm7,mm3 ; mm7=data5L + psubd mm2,mm0 ; mm2=data5H + + movq mm5,[GOTOFF(ebx,PD_DESCALE_P2)] ; mm5=[PD_DESCALE_P2] + + paddd mm6,mm5 + paddd mm1,mm5 + psrad mm6,DESCALE_P2 + psrad mm1,DESCALE_P2 + paddd mm7,mm5 + paddd mm2,mm5 + psrad mm7,DESCALE_P2 + psrad mm2,DESCALE_P2 + + packssdw mm6,mm1 ; mm6=data2=(02 12 22 32) + packssdw mm7,mm2 ; mm7=data5=(05 15 25 35) + + movq mm4, MMWORD [wk(2)] ; mm4=tmp13L + movq mm3, MMWORD [wk(3)] ; mm3=tmp13H + movq mm0, MMWORD [wk(8)] ; mm0=tmp0L + movq mm5, MMWORD [wk(9)] ; mm5=tmp0H + + movq mm1,mm4 + movq mm2,mm3 + paddd mm4,mm0 ; mm4=data3L + paddd mm3,mm5 ; mm3=data3H + psubd mm1,mm0 ; mm1=data4L + psubd mm2,mm5 ; mm2=data4H + + movq mm0,[GOTOFF(ebx,PD_DESCALE_P2)] ; mm0=[PD_DESCALE_P2] + + paddd mm4,mm0 + paddd mm3,mm0 + psrad mm4,DESCALE_P2 + psrad mm3,DESCALE_P2 + paddd mm1,mm0 + paddd mm2,mm0 + psrad mm1,DESCALE_P2 + psrad mm2,DESCALE_P2 + + movq mm5,[GOTOFF(ebx,PB_CENTERJSAMP)] ; mm5=[PB_CENTERJSAMP] + + packssdw mm4,mm3 ; mm4=data3=(03 13 23 33) + packssdw mm1,mm2 ; mm1=data4=(04 14 24 34) + + movq mm0, MMWORD [wk(0)] ; mm0=(00 10 20 30 06 16 26 36) + movq mm3, MMWORD [wk(1)] ; mm3=(01 11 21 31 07 17 27 37) + + packsswb mm6,mm1 ; mm6=(02 12 22 32 04 14 24 34) + packsswb mm4,mm7 ; mm4=(03 13 23 33 05 15 25 35) + + paddb mm0,mm5 + paddb mm3,mm5 + paddb mm6,mm5 + paddb mm4,mm5 + + movq mm2,mm0 ; transpose coefficients(phase 1) + punpcklbw mm0,mm3 ; mm0=(00 01 10 11 20 21 30 31) + punpckhbw mm2,mm3 ; mm2=(06 07 16 17 26 27 36 37) + movq mm1,mm6 ; transpose coefficients(phase 1) + punpcklbw mm6,mm4 ; mm6=(02 03 12 13 22 23 32 33) + punpckhbw mm1,mm4 ; mm1=(04 05 14 15 24 25 34 35) + + movq mm7,mm0 ; transpose coefficients(phase 2) + punpcklwd mm0,mm6 ; mm0=(00 01 02 03 10 11 12 13) + punpckhwd mm7,mm6 ; mm7=(20 21 22 23 30 31 32 33) + movq mm5,mm1 ; transpose coefficients(phase 2) + punpcklwd mm1,mm2 ; mm1=(04 05 06 07 14 15 16 17) + punpckhwd mm5,mm2 ; mm5=(24 25 26 27 34 35 36 37) + + movq mm3,mm0 ; transpose coefficients(phase 3) + punpckldq mm0,mm1 ; mm0=(00 01 02 03 04 05 06 07) + punpckhdq mm3,mm1 ; mm3=(10 11 12 13 14 15 16 17) + movq mm4,mm7 ; transpose coefficients(phase 3) + punpckldq mm7,mm5 ; mm7=(20 21 22 23 24 25 26 27) + punpckhdq mm4,mm5 ; mm4=(30 31 32 33 34 35 36 37) + + pushpic ebx ; save GOT address + + mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW] + mov ebx, JSAMPROW [edi+1*SIZEOF_JSAMPROW] + movq MMWORD [edx+eax*SIZEOF_JSAMPLE], mm0 + movq MMWORD [ebx+eax*SIZEOF_JSAMPLE], mm3 + mov edx, JSAMPROW [edi+2*SIZEOF_JSAMPROW] + mov ebx, JSAMPROW [edi+3*SIZEOF_JSAMPROW] + movq MMWORD [edx+eax*SIZEOF_JSAMPLE], mm7 + movq MMWORD [ebx+eax*SIZEOF_JSAMPLE], mm4 + + poppic ebx ; restore GOT address + + add esi, byte 4*SIZEOF_JCOEF ; wsptr + add edi, byte 4*SIZEOF_JSAMPROW + dec ecx ; ctr + jnz near .rowloop + + emms ; empty MMX state + + pop edi + pop esi +; pop edx ; need not be preserved +; pop ecx ; need not be preserved + pop ebx + mov esp,ebp ; esp <- aligned ebp + pop esp ; esp <- original ebp + pop ebp + ret + +; For some reason, the OS X linker does not honor the request to align the +; segment unless we do this. + align 16 diff --git a/media/libjpeg/simd/jidctint-sse2-64.asm b/media/libjpeg/simd/jidctint-sse2-64.asm new file mode 100644 index 000000000..afe1d6a73 --- /dev/null +++ b/media/libjpeg/simd/jidctint-sse2-64.asm @@ -0,0 +1,847 @@ +; +; jidctint.asm - accurate integer IDCT (64-bit SSE2) +; +; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB +; Copyright (C) 2009, D. R. Commander. +; +; Based on the x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler). +; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 +; +; This file contains a slow-but-accurate integer implementation of the +; inverse DCT (Discrete Cosine Transform). The following code is based +; directly on the IJG's original jidctint.c; see the jidctint.c for +; more details. +; +; [TAB8] + +%include "jsimdext.inc" +%include "jdct.inc" + +; -------------------------------------------------------------------------- + +%define CONST_BITS 13 +%define PASS1_BITS 2 + +%define DESCALE_P1 (CONST_BITS-PASS1_BITS) +%define DESCALE_P2 (CONST_BITS+PASS1_BITS+3) + +%if CONST_BITS == 13 +F_0_298 equ 2446 ; FIX(0.298631336) +F_0_390 equ 3196 ; FIX(0.390180644) +F_0_541 equ 4433 ; FIX(0.541196100) +F_0_765 equ 6270 ; FIX(0.765366865) +F_0_899 equ 7373 ; FIX(0.899976223) +F_1_175 equ 9633 ; FIX(1.175875602) +F_1_501 equ 12299 ; FIX(1.501321110) +F_1_847 equ 15137 ; FIX(1.847759065) +F_1_961 equ 16069 ; FIX(1.961570560) +F_2_053 equ 16819 ; FIX(2.053119869) +F_2_562 equ 20995 ; FIX(2.562915447) +F_3_072 equ 25172 ; FIX(3.072711026) +%else +; NASM cannot do compile-time arithmetic on floating-point constants. +%define DESCALE(x,n) (((x)+(1<<((n)-1)))>>(n)) +F_0_298 equ DESCALE( 320652955,30-CONST_BITS) ; FIX(0.298631336) +F_0_390 equ DESCALE( 418953276,30-CONST_BITS) ; FIX(0.390180644) +F_0_541 equ DESCALE( 581104887,30-CONST_BITS) ; FIX(0.541196100) +F_0_765 equ DESCALE( 821806413,30-CONST_BITS) ; FIX(0.765366865) +F_0_899 equ DESCALE( 966342111,30-CONST_BITS) ; FIX(0.899976223) +F_1_175 equ DESCALE(1262586813,30-CONST_BITS) ; FIX(1.175875602) +F_1_501 equ DESCALE(1612031267,30-CONST_BITS) ; FIX(1.501321110) +F_1_847 equ DESCALE(1984016188,30-CONST_BITS) ; FIX(1.847759065) +F_1_961 equ DESCALE(2106220350,30-CONST_BITS) ; FIX(1.961570560) +F_2_053 equ DESCALE(2204520673,30-CONST_BITS) ; FIX(2.053119869) +F_2_562 equ DESCALE(2751909506,30-CONST_BITS) ; FIX(2.562915447) +F_3_072 equ DESCALE(3299298341,30-CONST_BITS) ; FIX(3.072711026) +%endif + +; -------------------------------------------------------------------------- + SECTION SEG_CONST + + alignz 16 + global EXTN(jconst_idct_islow_sse2) + +EXTN(jconst_idct_islow_sse2): + +PW_F130_F054 times 4 dw (F_0_541+F_0_765), F_0_541 +PW_F054_MF130 times 4 dw F_0_541, (F_0_541-F_1_847) +PW_MF078_F117 times 4 dw (F_1_175-F_1_961), F_1_175 +PW_F117_F078 times 4 dw F_1_175, (F_1_175-F_0_390) +PW_MF060_MF089 times 4 dw (F_0_298-F_0_899),-F_0_899 +PW_MF089_F060 times 4 dw -F_0_899, (F_1_501-F_0_899) +PW_MF050_MF256 times 4 dw (F_2_053-F_2_562),-F_2_562 +PW_MF256_F050 times 4 dw -F_2_562, (F_3_072-F_2_562) +PD_DESCALE_P1 times 4 dd 1 << (DESCALE_P1-1) +PD_DESCALE_P2 times 4 dd 1 << (DESCALE_P2-1) +PB_CENTERJSAMP times 16 db CENTERJSAMPLE + + alignz 16 + +; -------------------------------------------------------------------------- + SECTION SEG_TEXT + BITS 64 +; +; Perform dequantization and inverse DCT on one block of coefficients. +; +; GLOBAL(void) +; jsimd_idct_islow_sse2 (void *dct_table, JCOEFPTR coef_block, +; JSAMPARRAY output_buf, JDIMENSION output_col) +; + +; r10 = jpeg_component_info *compptr +; r11 = JCOEFPTR coef_block +; r12 = JSAMPARRAY output_buf +; r13 = JDIMENSION output_col + +%define original_rbp rbp+0 +%define wk(i) rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM] +%define WK_NUM 12 + + align 16 + global EXTN(jsimd_idct_islow_sse2) + +EXTN(jsimd_idct_islow_sse2): + push rbp + mov rax,rsp ; rax = original rbp + sub rsp, byte 4 + and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits + mov [rsp],rax + mov rbp,rsp ; rbp = aligned rbp + lea rsp, [wk(0)] + collect_args + + ; ---- Pass 1: process columns from input. + + mov rdx, r10 ; quantptr + mov rsi, r11 ; inptr + +%ifndef NO_ZERO_COLUMN_TEST_ISLOW_SSE2 + mov eax, DWORD [DWBLOCK(1,0,rsi,SIZEOF_JCOEF)] + or eax, DWORD [DWBLOCK(2,0,rsi,SIZEOF_JCOEF)] + jnz near .columnDCT + + movdqa xmm0, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_JCOEF)] + movdqa xmm1, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_JCOEF)] + por xmm0, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_JCOEF)] + por xmm1, XMMWORD [XMMBLOCK(4,0,rsi,SIZEOF_JCOEF)] + por xmm0, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_JCOEF)] + por xmm1, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_JCOEF)] + por xmm0, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_JCOEF)] + por xmm1,xmm0 + packsswb xmm1,xmm1 + packsswb xmm1,xmm1 + movd eax,xmm1 + test rax,rax + jnz short .columnDCT + + ; -- AC terms all zero + + movdqa xmm5, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_JCOEF)] + pmullw xmm5, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_ISLOW_MULT_TYPE)] + + psllw xmm5,PASS1_BITS + + movdqa xmm4,xmm5 ; xmm5=in0=(00 01 02 03 04 05 06 07) + punpcklwd xmm5,xmm5 ; xmm5=(00 00 01 01 02 02 03 03) + punpckhwd xmm4,xmm4 ; xmm4=(04 04 05 05 06 06 07 07) + + pshufd xmm7,xmm5,0x00 ; xmm7=col0=(00 00 00 00 00 00 00 00) + pshufd xmm6,xmm5,0x55 ; xmm6=col1=(01 01 01 01 01 01 01 01) + pshufd xmm1,xmm5,0xAA ; xmm1=col2=(02 02 02 02 02 02 02 02) + pshufd xmm5,xmm5,0xFF ; xmm5=col3=(03 03 03 03 03 03 03 03) + pshufd xmm0,xmm4,0x00 ; xmm0=col4=(04 04 04 04 04 04 04 04) + pshufd xmm3,xmm4,0x55 ; xmm3=col5=(05 05 05 05 05 05 05 05) + pshufd xmm2,xmm4,0xAA ; xmm2=col6=(06 06 06 06 06 06 06 06) + pshufd xmm4,xmm4,0xFF ; xmm4=col7=(07 07 07 07 07 07 07 07) + + movdqa XMMWORD [wk(8)], xmm6 ; wk(8)=col1 + movdqa XMMWORD [wk(9)], xmm5 ; wk(9)=col3 + movdqa XMMWORD [wk(10)], xmm3 ; wk(10)=col5 + movdqa XMMWORD [wk(11)], xmm4 ; wk(11)=col7 + jmp near .column_end +%endif +.columnDCT: + + ; -- Even part + + movdqa xmm0, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_JCOEF)] + movdqa xmm1, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_JCOEF)] + pmullw xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_ISLOW_MULT_TYPE)] + pmullw xmm1, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_ISLOW_MULT_TYPE)] + movdqa xmm2, XMMWORD [XMMBLOCK(4,0,rsi,SIZEOF_JCOEF)] + movdqa xmm3, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_JCOEF)] + pmullw xmm2, XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_ISLOW_MULT_TYPE)] + pmullw xmm3, XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_ISLOW_MULT_TYPE)] + + ; (Original) + ; z1 = (z2 + z3) * 0.541196100; + ; tmp2 = z1 + z3 * -1.847759065; + ; tmp3 = z1 + z2 * 0.765366865; + ; + ; (This implementation) + ; tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065); + ; tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100; + + movdqa xmm4,xmm1 ; xmm1=in2=z2 + movdqa xmm5,xmm1 + punpcklwd xmm4,xmm3 ; xmm3=in6=z3 + punpckhwd xmm5,xmm3 + movdqa xmm1,xmm4 + movdqa xmm3,xmm5 + pmaddwd xmm4,[rel PW_F130_F054] ; xmm4=tmp3L + pmaddwd xmm5,[rel PW_F130_F054] ; xmm5=tmp3H + pmaddwd xmm1,[rel PW_F054_MF130] ; xmm1=tmp2L + pmaddwd xmm3,[rel PW_F054_MF130] ; xmm3=tmp2H + + movdqa xmm6,xmm0 + paddw xmm0,xmm2 ; xmm0=in0+in4 + psubw xmm6,xmm2 ; xmm6=in0-in4 + + pxor xmm7,xmm7 + pxor xmm2,xmm2 + punpcklwd xmm7,xmm0 ; xmm7=tmp0L + punpckhwd xmm2,xmm0 ; xmm2=tmp0H + psrad xmm7,(16-CONST_BITS) ; psrad xmm7,16 & pslld xmm7,CONST_BITS + psrad xmm2,(16-CONST_BITS) ; psrad xmm2,16 & pslld xmm2,CONST_BITS + + movdqa xmm0,xmm7 + paddd xmm7,xmm4 ; xmm7=tmp10L + psubd xmm0,xmm4 ; xmm0=tmp13L + movdqa xmm4,xmm2 + paddd xmm2,xmm5 ; xmm2=tmp10H + psubd xmm4,xmm5 ; xmm4=tmp13H + + movdqa XMMWORD [wk(0)], xmm7 ; wk(0)=tmp10L + movdqa XMMWORD [wk(1)], xmm2 ; wk(1)=tmp10H + movdqa XMMWORD [wk(2)], xmm0 ; wk(2)=tmp13L + movdqa XMMWORD [wk(3)], xmm4 ; wk(3)=tmp13H + + pxor xmm5,xmm5 + pxor xmm7,xmm7 + punpcklwd xmm5,xmm6 ; xmm5=tmp1L + punpckhwd xmm7,xmm6 ; xmm7=tmp1H + psrad xmm5,(16-CONST_BITS) ; psrad xmm5,16 & pslld xmm5,CONST_BITS + psrad xmm7,(16-CONST_BITS) ; psrad xmm7,16 & pslld xmm7,CONST_BITS + + movdqa xmm2,xmm5 + paddd xmm5,xmm1 ; xmm5=tmp11L + psubd xmm2,xmm1 ; xmm2=tmp12L + movdqa xmm0,xmm7 + paddd xmm7,xmm3 ; xmm7=tmp11H + psubd xmm0,xmm3 ; xmm0=tmp12H + + movdqa XMMWORD [wk(4)], xmm5 ; wk(4)=tmp11L + movdqa XMMWORD [wk(5)], xmm7 ; wk(5)=tmp11H + movdqa XMMWORD [wk(6)], xmm2 ; wk(6)=tmp12L + movdqa XMMWORD [wk(7)], xmm0 ; wk(7)=tmp12H + + ; -- Odd part + + movdqa xmm4, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_JCOEF)] + movdqa xmm6, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_JCOEF)] + pmullw xmm4, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_ISLOW_MULT_TYPE)] + pmullw xmm6, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_ISLOW_MULT_TYPE)] + movdqa xmm1, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_JCOEF)] + movdqa xmm3, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_JCOEF)] + pmullw xmm1, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_ISLOW_MULT_TYPE)] + pmullw xmm3, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_ISLOW_MULT_TYPE)] + + movdqa xmm5,xmm6 + movdqa xmm7,xmm4 + paddw xmm5,xmm3 ; xmm5=z3 + paddw xmm7,xmm1 ; xmm7=z4 + + ; (Original) + ; z5 = (z3 + z4) * 1.175875602; + ; z3 = z3 * -1.961570560; z4 = z4 * -0.390180644; + ; z3 += z5; z4 += z5; + ; + ; (This implementation) + ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602; + ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644); + + movdqa xmm2,xmm5 + movdqa xmm0,xmm5 + punpcklwd xmm2,xmm7 + punpckhwd xmm0,xmm7 + movdqa xmm5,xmm2 + movdqa xmm7,xmm0 + pmaddwd xmm2,[rel PW_MF078_F117] ; xmm2=z3L + pmaddwd xmm0,[rel PW_MF078_F117] ; xmm0=z3H + pmaddwd xmm5,[rel PW_F117_F078] ; xmm5=z4L + pmaddwd xmm7,[rel PW_F117_F078] ; xmm7=z4H + + movdqa XMMWORD [wk(10)], xmm2 ; wk(10)=z3L + movdqa XMMWORD [wk(11)], xmm0 ; wk(11)=z3H + + ; (Original) + ; z1 = tmp0 + tmp3; z2 = tmp1 + tmp2; + ; tmp0 = tmp0 * 0.298631336; tmp1 = tmp1 * 2.053119869; + ; tmp2 = tmp2 * 3.072711026; tmp3 = tmp3 * 1.501321110; + ; z1 = z1 * -0.899976223; z2 = z2 * -2.562915447; + ; tmp0 += z1 + z3; tmp1 += z2 + z4; + ; tmp2 += z2 + z3; tmp3 += z1 + z4; + ; + ; (This implementation) + ; tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223; + ; tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447; + ; tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447); + ; tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223); + ; tmp0 += z3; tmp1 += z4; + ; tmp2 += z3; tmp3 += z4; + + movdqa xmm2,xmm3 + movdqa xmm0,xmm3 + punpcklwd xmm2,xmm4 + punpckhwd xmm0,xmm4 + movdqa xmm3,xmm2 + movdqa xmm4,xmm0 + pmaddwd xmm2,[rel PW_MF060_MF089] ; xmm2=tmp0L + pmaddwd xmm0,[rel PW_MF060_MF089] ; xmm0=tmp0H + pmaddwd xmm3,[rel PW_MF089_F060] ; xmm3=tmp3L + pmaddwd xmm4,[rel PW_MF089_F060] ; xmm4=tmp3H + + paddd xmm2, XMMWORD [wk(10)] ; xmm2=tmp0L + paddd xmm0, XMMWORD [wk(11)] ; xmm0=tmp0H + paddd xmm3,xmm5 ; xmm3=tmp3L + paddd xmm4,xmm7 ; xmm4=tmp3H + + movdqa XMMWORD [wk(8)], xmm2 ; wk(8)=tmp0L + movdqa XMMWORD [wk(9)], xmm0 ; wk(9)=tmp0H + + movdqa xmm2,xmm1 + movdqa xmm0,xmm1 + punpcklwd xmm2,xmm6 + punpckhwd xmm0,xmm6 + movdqa xmm1,xmm2 + movdqa xmm6,xmm0 + pmaddwd xmm2,[rel PW_MF050_MF256] ; xmm2=tmp1L + pmaddwd xmm0,[rel PW_MF050_MF256] ; xmm0=tmp1H + pmaddwd xmm1,[rel PW_MF256_F050] ; xmm1=tmp2L + pmaddwd xmm6,[rel PW_MF256_F050] ; xmm6=tmp2H + + paddd xmm2,xmm5 ; xmm2=tmp1L + paddd xmm0,xmm7 ; xmm0=tmp1H + paddd xmm1, XMMWORD [wk(10)] ; xmm1=tmp2L + paddd xmm6, XMMWORD [wk(11)] ; xmm6=tmp2H + + movdqa XMMWORD [wk(10)], xmm2 ; wk(10)=tmp1L + movdqa XMMWORD [wk(11)], xmm0 ; wk(11)=tmp1H + + ; -- Final output stage + + movdqa xmm5, XMMWORD [wk(0)] ; xmm5=tmp10L + movdqa xmm7, XMMWORD [wk(1)] ; xmm7=tmp10H + + movdqa xmm2,xmm5 + movdqa xmm0,xmm7 + paddd xmm5,xmm3 ; xmm5=data0L + paddd xmm7,xmm4 ; xmm7=data0H + psubd xmm2,xmm3 ; xmm2=data7L + psubd xmm0,xmm4 ; xmm0=data7H + + movdqa xmm3,[rel PD_DESCALE_P1] ; xmm3=[rel PD_DESCALE_P1] + + paddd xmm5,xmm3 + paddd xmm7,xmm3 + psrad xmm5,DESCALE_P1 + psrad xmm7,DESCALE_P1 + paddd xmm2,xmm3 + paddd xmm0,xmm3 + psrad xmm2,DESCALE_P1 + psrad xmm0,DESCALE_P1 + + packssdw xmm5,xmm7 ; xmm5=data0=(00 01 02 03 04 05 06 07) + packssdw xmm2,xmm0 ; xmm2=data7=(70 71 72 73 74 75 76 77) + + movdqa xmm4, XMMWORD [wk(4)] ; xmm4=tmp11L + movdqa xmm3, XMMWORD [wk(5)] ; xmm3=tmp11H + + movdqa xmm7,xmm4 + movdqa xmm0,xmm3 + paddd xmm4,xmm1 ; xmm4=data1L + paddd xmm3,xmm6 ; xmm3=data1H + psubd xmm7,xmm1 ; xmm7=data6L + psubd xmm0,xmm6 ; xmm0=data6H + + movdqa xmm1,[rel PD_DESCALE_P1] ; xmm1=[rel PD_DESCALE_P1] + + paddd xmm4,xmm1 + paddd xmm3,xmm1 + psrad xmm4,DESCALE_P1 + psrad xmm3,DESCALE_P1 + paddd xmm7,xmm1 + paddd xmm0,xmm1 + psrad xmm7,DESCALE_P1 + psrad xmm0,DESCALE_P1 + + packssdw xmm4,xmm3 ; xmm4=data1=(10 11 12 13 14 15 16 17) + packssdw xmm7,xmm0 ; xmm7=data6=(60 61 62 63 64 65 66 67) + + movdqa xmm6,xmm5 ; transpose coefficients(phase 1) + punpcklwd xmm5,xmm4 ; xmm5=(00 10 01 11 02 12 03 13) + punpckhwd xmm6,xmm4 ; xmm6=(04 14 05 15 06 16 07 17) + movdqa xmm1,xmm7 ; transpose coefficients(phase 1) + punpcklwd xmm7,xmm2 ; xmm7=(60 70 61 71 62 72 63 73) + punpckhwd xmm1,xmm2 ; xmm1=(64 74 65 75 66 76 67 77) + + movdqa xmm3, XMMWORD [wk(6)] ; xmm3=tmp12L + movdqa xmm0, XMMWORD [wk(7)] ; xmm0=tmp12H + movdqa xmm4, XMMWORD [wk(10)] ; xmm4=tmp1L + movdqa xmm2, XMMWORD [wk(11)] ; xmm2=tmp1H + + movdqa XMMWORD [wk(0)], xmm5 ; wk(0)=(00 10 01 11 02 12 03 13) + movdqa XMMWORD [wk(1)], xmm6 ; wk(1)=(04 14 05 15 06 16 07 17) + movdqa XMMWORD [wk(4)], xmm7 ; wk(4)=(60 70 61 71 62 72 63 73) + movdqa XMMWORD [wk(5)], xmm1 ; wk(5)=(64 74 65 75 66 76 67 77) + + movdqa xmm5,xmm3 + movdqa xmm6,xmm0 + paddd xmm3,xmm4 ; xmm3=data2L + paddd xmm0,xmm2 ; xmm0=data2H + psubd xmm5,xmm4 ; xmm5=data5L + psubd xmm6,xmm2 ; xmm6=data5H + + movdqa xmm7,[rel PD_DESCALE_P1] ; xmm7=[rel PD_DESCALE_P1] + + paddd xmm3,xmm7 + paddd xmm0,xmm7 + psrad xmm3,DESCALE_P1 + psrad xmm0,DESCALE_P1 + paddd xmm5,xmm7 + paddd xmm6,xmm7 + psrad xmm5,DESCALE_P1 + psrad xmm6,DESCALE_P1 + + packssdw xmm3,xmm0 ; xmm3=data2=(20 21 22 23 24 25 26 27) + packssdw xmm5,xmm6 ; xmm5=data5=(50 51 52 53 54 55 56 57) + + movdqa xmm1, XMMWORD [wk(2)] ; xmm1=tmp13L + movdqa xmm4, XMMWORD [wk(3)] ; xmm4=tmp13H + movdqa xmm2, XMMWORD [wk(8)] ; xmm2=tmp0L + movdqa xmm7, XMMWORD [wk(9)] ; xmm7=tmp0H + + movdqa xmm0,xmm1 + movdqa xmm6,xmm4 + paddd xmm1,xmm2 ; xmm1=data3L + paddd xmm4,xmm7 ; xmm4=data3H + psubd xmm0,xmm2 ; xmm0=data4L + psubd xmm6,xmm7 ; xmm6=data4H + + movdqa xmm2,[rel PD_DESCALE_P1] ; xmm2=[rel PD_DESCALE_P1] + + paddd xmm1,xmm2 + paddd xmm4,xmm2 + psrad xmm1,DESCALE_P1 + psrad xmm4,DESCALE_P1 + paddd xmm0,xmm2 + paddd xmm6,xmm2 + psrad xmm0,DESCALE_P1 + psrad xmm6,DESCALE_P1 + + packssdw xmm1,xmm4 ; xmm1=data3=(30 31 32 33 34 35 36 37) + packssdw xmm0,xmm6 ; xmm0=data4=(40 41 42 43 44 45 46 47) + + movdqa xmm7, XMMWORD [wk(0)] ; xmm7=(00 10 01 11 02 12 03 13) + movdqa xmm2, XMMWORD [wk(1)] ; xmm2=(04 14 05 15 06 16 07 17) + + movdqa xmm4,xmm3 ; transpose coefficients(phase 1) + punpcklwd xmm3,xmm1 ; xmm3=(20 30 21 31 22 32 23 33) + punpckhwd xmm4,xmm1 ; xmm4=(24 34 25 35 26 36 27 37) + movdqa xmm6,xmm0 ; transpose coefficients(phase 1) + punpcklwd xmm0,xmm5 ; xmm0=(40 50 41 51 42 52 43 53) + punpckhwd xmm6,xmm5 ; xmm6=(44 54 45 55 46 56 47 57) + + movdqa xmm1,xmm7 ; transpose coefficients(phase 2) + punpckldq xmm7,xmm3 ; xmm7=(00 10 20 30 01 11 21 31) + punpckhdq xmm1,xmm3 ; xmm1=(02 12 22 32 03 13 23 33) + movdqa xmm5,xmm2 ; transpose coefficients(phase 2) + punpckldq xmm2,xmm4 ; xmm2=(04 14 24 34 05 15 25 35) + punpckhdq xmm5,xmm4 ; xmm5=(06 16 26 36 07 17 27 37) + + movdqa xmm3, XMMWORD [wk(4)] ; xmm3=(60 70 61 71 62 72 63 73) + movdqa xmm4, XMMWORD [wk(5)] ; xmm4=(64 74 65 75 66 76 67 77) + + movdqa XMMWORD [wk(6)], xmm2 ; wk(6)=(04 14 24 34 05 15 25 35) + movdqa XMMWORD [wk(7)], xmm5 ; wk(7)=(06 16 26 36 07 17 27 37) + + movdqa xmm2,xmm0 ; transpose coefficients(phase 2) + punpckldq xmm0,xmm3 ; xmm0=(40 50 60 70 41 51 61 71) + punpckhdq xmm2,xmm3 ; xmm2=(42 52 62 72 43 53 63 73) + movdqa xmm5,xmm6 ; transpose coefficients(phase 2) + punpckldq xmm6,xmm4 ; xmm6=(44 54 64 74 45 55 65 75) + punpckhdq xmm5,xmm4 ; xmm5=(46 56 66 76 47 57 67 77) + + movdqa xmm3,xmm7 ; transpose coefficients(phase 3) + punpcklqdq xmm7,xmm0 ; xmm7=col0=(00 10 20 30 40 50 60 70) + punpckhqdq xmm3,xmm0 ; xmm3=col1=(01 11 21 31 41 51 61 71) + movdqa xmm4,xmm1 ; transpose coefficients(phase 3) + punpcklqdq xmm1,xmm2 ; xmm1=col2=(02 12 22 32 42 52 62 72) + punpckhqdq xmm4,xmm2 ; xmm4=col3=(03 13 23 33 43 53 63 73) + + movdqa xmm0, XMMWORD [wk(6)] ; xmm0=(04 14 24 34 05 15 25 35) + movdqa xmm2, XMMWORD [wk(7)] ; xmm2=(06 16 26 36 07 17 27 37) + + movdqa XMMWORD [wk(8)], xmm3 ; wk(8)=col1 + movdqa XMMWORD [wk(9)], xmm4 ; wk(9)=col3 + + movdqa xmm3,xmm0 ; transpose coefficients(phase 3) + punpcklqdq xmm0,xmm6 ; xmm0=col4=(04 14 24 34 44 54 64 74) + punpckhqdq xmm3,xmm6 ; xmm3=col5=(05 15 25 35 45 55 65 75) + movdqa xmm4,xmm2 ; transpose coefficients(phase 3) + punpcklqdq xmm2,xmm5 ; xmm2=col6=(06 16 26 36 46 56 66 76) + punpckhqdq xmm4,xmm5 ; xmm4=col7=(07 17 27 37 47 57 67 77) + + movdqa XMMWORD [wk(10)], xmm3 ; wk(10)=col5 + movdqa XMMWORD [wk(11)], xmm4 ; wk(11)=col7 +.column_end: + + ; -- Prefetch the next coefficient block + + prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 0*32] + prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 1*32] + prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 2*32] + prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 3*32] + + ; ---- Pass 2: process rows from work array, store into output array. + + mov rax, [original_rbp] + mov rdi, r12 ; (JSAMPROW *) + mov eax, r13d + + ; -- Even part + + ; xmm7=col0, xmm1=col2, xmm0=col4, xmm2=col6 + + ; (Original) + ; z1 = (z2 + z3) * 0.541196100; + ; tmp2 = z1 + z3 * -1.847759065; + ; tmp3 = z1 + z2 * 0.765366865; + ; + ; (This implementation) + ; tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065); + ; tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100; + + movdqa xmm6,xmm1 ; xmm1=in2=z2 + movdqa xmm5,xmm1 + punpcklwd xmm6,xmm2 ; xmm2=in6=z3 + punpckhwd xmm5,xmm2 + movdqa xmm1,xmm6 + movdqa xmm2,xmm5 + pmaddwd xmm6,[rel PW_F130_F054] ; xmm6=tmp3L + pmaddwd xmm5,[rel PW_F130_F054] ; xmm5=tmp3H + pmaddwd xmm1,[rel PW_F054_MF130] ; xmm1=tmp2L + pmaddwd xmm2,[rel PW_F054_MF130] ; xmm2=tmp2H + + movdqa xmm3,xmm7 + paddw xmm7,xmm0 ; xmm7=in0+in4 + psubw xmm3,xmm0 ; xmm3=in0-in4 + + pxor xmm4,xmm4 + pxor xmm0,xmm0 + punpcklwd xmm4,xmm7 ; xmm4=tmp0L + punpckhwd xmm0,xmm7 ; xmm0=tmp0H + psrad xmm4,(16-CONST_BITS) ; psrad xmm4,16 & pslld xmm4,CONST_BITS + psrad xmm0,(16-CONST_BITS) ; psrad xmm0,16 & pslld xmm0,CONST_BITS + + movdqa xmm7,xmm4 + paddd xmm4,xmm6 ; xmm4=tmp10L + psubd xmm7,xmm6 ; xmm7=tmp13L + movdqa xmm6,xmm0 + paddd xmm0,xmm5 ; xmm0=tmp10H + psubd xmm6,xmm5 ; xmm6=tmp13H + + movdqa XMMWORD [wk(0)], xmm4 ; wk(0)=tmp10L + movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=tmp10H + movdqa XMMWORD [wk(2)], xmm7 ; wk(2)=tmp13L + movdqa XMMWORD [wk(3)], xmm6 ; wk(3)=tmp13H + + pxor xmm5,xmm5 + pxor xmm4,xmm4 + punpcklwd xmm5,xmm3 ; xmm5=tmp1L + punpckhwd xmm4,xmm3 ; xmm4=tmp1H + psrad xmm5,(16-CONST_BITS) ; psrad xmm5,16 & pslld xmm5,CONST_BITS + psrad xmm4,(16-CONST_BITS) ; psrad xmm4,16 & pslld xmm4,CONST_BITS + + movdqa xmm0,xmm5 + paddd xmm5,xmm1 ; xmm5=tmp11L + psubd xmm0,xmm1 ; xmm0=tmp12L + movdqa xmm7,xmm4 + paddd xmm4,xmm2 ; xmm4=tmp11H + psubd xmm7,xmm2 ; xmm7=tmp12H + + movdqa XMMWORD [wk(4)], xmm5 ; wk(4)=tmp11L + movdqa XMMWORD [wk(5)], xmm4 ; wk(5)=tmp11H + movdqa XMMWORD [wk(6)], xmm0 ; wk(6)=tmp12L + movdqa XMMWORD [wk(7)], xmm7 ; wk(7)=tmp12H + + ; -- Odd part + + movdqa xmm6, XMMWORD [wk(9)] ; xmm6=col3 + movdqa xmm3, XMMWORD [wk(8)] ; xmm3=col1 + movdqa xmm1, XMMWORD [wk(11)] ; xmm1=col7 + movdqa xmm2, XMMWORD [wk(10)] ; xmm2=col5 + + movdqa xmm5,xmm6 + movdqa xmm4,xmm3 + paddw xmm5,xmm1 ; xmm5=z3 + paddw xmm4,xmm2 ; xmm4=z4 + + ; (Original) + ; z5 = (z3 + z4) * 1.175875602; + ; z3 = z3 * -1.961570560; z4 = z4 * -0.390180644; + ; z3 += z5; z4 += z5; + ; + ; (This implementation) + ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602; + ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644); + + movdqa xmm0,xmm5 + movdqa xmm7,xmm5 + punpcklwd xmm0,xmm4 + punpckhwd xmm7,xmm4 + movdqa xmm5,xmm0 + movdqa xmm4,xmm7 + pmaddwd xmm0,[rel PW_MF078_F117] ; xmm0=z3L + pmaddwd xmm7,[rel PW_MF078_F117] ; xmm7=z3H + pmaddwd xmm5,[rel PW_F117_F078] ; xmm5=z4L + pmaddwd xmm4,[rel PW_F117_F078] ; xmm4=z4H + + movdqa XMMWORD [wk(10)], xmm0 ; wk(10)=z3L + movdqa XMMWORD [wk(11)], xmm7 ; wk(11)=z3H + + ; (Original) + ; z1 = tmp0 + tmp3; z2 = tmp1 + tmp2; + ; tmp0 = tmp0 * 0.298631336; tmp1 = tmp1 * 2.053119869; + ; tmp2 = tmp2 * 3.072711026; tmp3 = tmp3 * 1.501321110; + ; z1 = z1 * -0.899976223; z2 = z2 * -2.562915447; + ; tmp0 += z1 + z3; tmp1 += z2 + z4; + ; tmp2 += z2 + z3; tmp3 += z1 + z4; + ; + ; (This implementation) + ; tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223; + ; tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447; + ; tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447); + ; tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223); + ; tmp0 += z3; tmp1 += z4; + ; tmp2 += z3; tmp3 += z4; + + movdqa xmm0,xmm1 + movdqa xmm7,xmm1 + punpcklwd xmm0,xmm3 + punpckhwd xmm7,xmm3 + movdqa xmm1,xmm0 + movdqa xmm3,xmm7 + pmaddwd xmm0,[rel PW_MF060_MF089] ; xmm0=tmp0L + pmaddwd xmm7,[rel PW_MF060_MF089] ; xmm7=tmp0H + pmaddwd xmm1,[rel PW_MF089_F060] ; xmm1=tmp3L + pmaddwd xmm3,[rel PW_MF089_F060] ; xmm3=tmp3H + + paddd xmm0, XMMWORD [wk(10)] ; xmm0=tmp0L + paddd xmm7, XMMWORD [wk(11)] ; xmm7=tmp0H + paddd xmm1,xmm5 ; xmm1=tmp3L + paddd xmm3,xmm4 ; xmm3=tmp3H + + movdqa XMMWORD [wk(8)], xmm0 ; wk(8)=tmp0L + movdqa XMMWORD [wk(9)], xmm7 ; wk(9)=tmp0H + + movdqa xmm0,xmm2 + movdqa xmm7,xmm2 + punpcklwd xmm0,xmm6 + punpckhwd xmm7,xmm6 + movdqa xmm2,xmm0 + movdqa xmm6,xmm7 + pmaddwd xmm0,[rel PW_MF050_MF256] ; xmm0=tmp1L + pmaddwd xmm7,[rel PW_MF050_MF256] ; xmm7=tmp1H + pmaddwd xmm2,[rel PW_MF256_F050] ; xmm2=tmp2L + pmaddwd xmm6,[rel PW_MF256_F050] ; xmm6=tmp2H + + paddd xmm0,xmm5 ; xmm0=tmp1L + paddd xmm7,xmm4 ; xmm7=tmp1H + paddd xmm2, XMMWORD [wk(10)] ; xmm2=tmp2L + paddd xmm6, XMMWORD [wk(11)] ; xmm6=tmp2H + + movdqa XMMWORD [wk(10)], xmm0 ; wk(10)=tmp1L + movdqa XMMWORD [wk(11)], xmm7 ; wk(11)=tmp1H + + ; -- Final output stage + + movdqa xmm5, XMMWORD [wk(0)] ; xmm5=tmp10L + movdqa xmm4, XMMWORD [wk(1)] ; xmm4=tmp10H + + movdqa xmm0,xmm5 + movdqa xmm7,xmm4 + paddd xmm5,xmm1 ; xmm5=data0L + paddd xmm4,xmm3 ; xmm4=data0H + psubd xmm0,xmm1 ; xmm0=data7L + psubd xmm7,xmm3 ; xmm7=data7H + + movdqa xmm1,[rel PD_DESCALE_P2] ; xmm1=[rel PD_DESCALE_P2] + + paddd xmm5,xmm1 + paddd xmm4,xmm1 + psrad xmm5,DESCALE_P2 + psrad xmm4,DESCALE_P2 + paddd xmm0,xmm1 + paddd xmm7,xmm1 + psrad xmm0,DESCALE_P2 + psrad xmm7,DESCALE_P2 + + packssdw xmm5,xmm4 ; xmm5=data0=(00 10 20 30 40 50 60 70) + packssdw xmm0,xmm7 ; xmm0=data7=(07 17 27 37 47 57 67 77) + + movdqa xmm3, XMMWORD [wk(4)] ; xmm3=tmp11L + movdqa xmm1, XMMWORD [wk(5)] ; xmm1=tmp11H + + movdqa xmm4,xmm3 + movdqa xmm7,xmm1 + paddd xmm3,xmm2 ; xmm3=data1L + paddd xmm1,xmm6 ; xmm1=data1H + psubd xmm4,xmm2 ; xmm4=data6L + psubd xmm7,xmm6 ; xmm7=data6H + + movdqa xmm2,[rel PD_DESCALE_P2] ; xmm2=[rel PD_DESCALE_P2] + + paddd xmm3,xmm2 + paddd xmm1,xmm2 + psrad xmm3,DESCALE_P2 + psrad xmm1,DESCALE_P2 + paddd xmm4,xmm2 + paddd xmm7,xmm2 + psrad xmm4,DESCALE_P2 + psrad xmm7,DESCALE_P2 + + packssdw xmm3,xmm1 ; xmm3=data1=(01 11 21 31 41 51 61 71) + packssdw xmm4,xmm7 ; xmm4=data6=(06 16 26 36 46 56 66 76) + + packsswb xmm5,xmm4 ; xmm5=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76) + packsswb xmm3,xmm0 ; xmm3=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77) + + movdqa xmm6, XMMWORD [wk(6)] ; xmm6=tmp12L + movdqa xmm2, XMMWORD [wk(7)] ; xmm2=tmp12H + movdqa xmm1, XMMWORD [wk(10)] ; xmm1=tmp1L + movdqa xmm7, XMMWORD [wk(11)] ; xmm7=tmp1H + + movdqa XMMWORD [wk(0)], xmm5 ; wk(0)=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76) + movdqa XMMWORD [wk(1)], xmm3 ; wk(1)=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77) + + movdqa xmm4,xmm6 + movdqa xmm0,xmm2 + paddd xmm6,xmm1 ; xmm6=data2L + paddd xmm2,xmm7 ; xmm2=data2H + psubd xmm4,xmm1 ; xmm4=data5L + psubd xmm0,xmm7 ; xmm0=data5H + + movdqa xmm5,[rel PD_DESCALE_P2] ; xmm5=[rel PD_DESCALE_P2] + + paddd xmm6,xmm5 + paddd xmm2,xmm5 + psrad xmm6,DESCALE_P2 + psrad xmm2,DESCALE_P2 + paddd xmm4,xmm5 + paddd xmm0,xmm5 + psrad xmm4,DESCALE_P2 + psrad xmm0,DESCALE_P2 + + packssdw xmm6,xmm2 ; xmm6=data2=(02 12 22 32 42 52 62 72) + packssdw xmm4,xmm0 ; xmm4=data5=(05 15 25 35 45 55 65 75) + + movdqa xmm3, XMMWORD [wk(2)] ; xmm3=tmp13L + movdqa xmm1, XMMWORD [wk(3)] ; xmm1=tmp13H + movdqa xmm7, XMMWORD [wk(8)] ; xmm7=tmp0L + movdqa xmm5, XMMWORD [wk(9)] ; xmm5=tmp0H + + movdqa xmm2,xmm3 + movdqa xmm0,xmm1 + paddd xmm3,xmm7 ; xmm3=data3L + paddd xmm1,xmm5 ; xmm1=data3H + psubd xmm2,xmm7 ; xmm2=data4L + psubd xmm0,xmm5 ; xmm0=data4H + + movdqa xmm7,[rel PD_DESCALE_P2] ; xmm7=[rel PD_DESCALE_P2] + + paddd xmm3,xmm7 + paddd xmm1,xmm7 + psrad xmm3,DESCALE_P2 + psrad xmm1,DESCALE_P2 + paddd xmm2,xmm7 + paddd xmm0,xmm7 + psrad xmm2,DESCALE_P2 + psrad xmm0,DESCALE_P2 + + movdqa xmm5,[rel PB_CENTERJSAMP] ; xmm5=[rel PB_CENTERJSAMP] + + packssdw xmm3,xmm1 ; xmm3=data3=(03 13 23 33 43 53 63 73) + packssdw xmm2,xmm0 ; xmm2=data4=(04 14 24 34 44 54 64 74) + + movdqa xmm7, XMMWORD [wk(0)] ; xmm7=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76) + movdqa xmm1, XMMWORD [wk(1)] ; xmm1=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77) + + packsswb xmm6,xmm2 ; xmm6=(02 12 22 32 42 52 62 72 04 14 24 34 44 54 64 74) + packsswb xmm3,xmm4 ; xmm3=(03 13 23 33 43 53 63 73 05 15 25 35 45 55 65 75) + + paddb xmm7,xmm5 + paddb xmm1,xmm5 + paddb xmm6,xmm5 + paddb xmm3,xmm5 + + movdqa xmm0,xmm7 ; transpose coefficients(phase 1) + punpcklbw xmm7,xmm1 ; xmm7=(00 01 10 11 20 21 30 31 40 41 50 51 60 61 70 71) + punpckhbw xmm0,xmm1 ; xmm0=(06 07 16 17 26 27 36 37 46 47 56 57 66 67 76 77) + movdqa xmm2,xmm6 ; transpose coefficients(phase 1) + punpcklbw xmm6,xmm3 ; xmm6=(02 03 12 13 22 23 32 33 42 43 52 53 62 63 72 73) + punpckhbw xmm2,xmm3 ; xmm2=(04 05 14 15 24 25 34 35 44 45 54 55 64 65 74 75) + + movdqa xmm4,xmm7 ; transpose coefficients(phase 2) + punpcklwd xmm7,xmm6 ; xmm7=(00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33) + punpckhwd xmm4,xmm6 ; xmm4=(40 41 42 43 50 51 52 53 60 61 62 63 70 71 72 73) + movdqa xmm5,xmm2 ; transpose coefficients(phase 2) + punpcklwd xmm2,xmm0 ; xmm2=(04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37) + punpckhwd xmm5,xmm0 ; xmm5=(44 45 46 47 54 55 56 57 64 65 66 67 74 75 76 77) + + movdqa xmm1,xmm7 ; transpose coefficients(phase 3) + punpckldq xmm7,xmm2 ; xmm7=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17) + punpckhdq xmm1,xmm2 ; xmm1=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37) + movdqa xmm3,xmm4 ; transpose coefficients(phase 3) + punpckldq xmm4,xmm5 ; xmm4=(40 41 42 43 44 45 46 47 50 51 52 53 54 55 56 57) + punpckhdq xmm3,xmm5 ; xmm3=(60 61 62 63 64 65 66 67 70 71 72 73 74 75 76 77) + + pshufd xmm6,xmm7,0x4E ; xmm6=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07) + pshufd xmm0,xmm1,0x4E ; xmm0=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27) + pshufd xmm2,xmm4,0x4E ; xmm2=(50 51 52 53 54 55 56 57 40 41 42 43 44 45 46 47) + pshufd xmm5,xmm3,0x4E ; xmm5=(70 71 72 73 74 75 76 77 60 61 62 63 64 65 66 67) + + mov rdx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW] + mov rsi, JSAMPROW [rdi+2*SIZEOF_JSAMPROW] + movq XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm7 + movq XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm1 + mov rdx, JSAMPROW [rdi+4*SIZEOF_JSAMPROW] + mov rsi, JSAMPROW [rdi+6*SIZEOF_JSAMPROW] + movq XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm4 + movq XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm3 + + mov rdx, JSAMPROW [rdi+1*SIZEOF_JSAMPROW] + mov rsi, JSAMPROW [rdi+3*SIZEOF_JSAMPROW] + movq XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm6 + movq XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm0 + mov rdx, JSAMPROW [rdi+5*SIZEOF_JSAMPROW] + mov rsi, JSAMPROW [rdi+7*SIZEOF_JSAMPROW] + movq XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm2 + movq XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm5 + + uncollect_args + mov rsp,rbp ; rsp <- aligned rbp + pop rsp ; rsp <- original rbp + pop rbp + ret + +; For some reason, the OS X linker does not honor the request to align the +; segment unless we do this. + align 16 diff --git a/media/libjpeg/simd/jidctint-sse2.asm b/media/libjpeg/simd/jidctint-sse2.asm new file mode 100644 index 000000000..6c7e7d9b4 --- /dev/null +++ b/media/libjpeg/simd/jidctint-sse2.asm @@ -0,0 +1,858 @@ +; +; jidctint.asm - accurate integer IDCT (SSE2) +; +; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB +; +; Based on the x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler). +; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 +; +; This file contains a slow-but-accurate integer implementation of the +; inverse DCT (Discrete Cosine Transform). The following code is based +; directly on the IJG's original jidctint.c; see the jidctint.c for +; more details. +; +; [TAB8] + +%include "jsimdext.inc" +%include "jdct.inc" + +; -------------------------------------------------------------------------- + +%define CONST_BITS 13 +%define PASS1_BITS 2 + +%define DESCALE_P1 (CONST_BITS-PASS1_BITS) +%define DESCALE_P2 (CONST_BITS+PASS1_BITS+3) + +%if CONST_BITS == 13 +F_0_298 equ 2446 ; FIX(0.298631336) +F_0_390 equ 3196 ; FIX(0.390180644) +F_0_541 equ 4433 ; FIX(0.541196100) +F_0_765 equ 6270 ; FIX(0.765366865) +F_0_899 equ 7373 ; FIX(0.899976223) +F_1_175 equ 9633 ; FIX(1.175875602) +F_1_501 equ 12299 ; FIX(1.501321110) +F_1_847 equ 15137 ; FIX(1.847759065) +F_1_961 equ 16069 ; FIX(1.961570560) +F_2_053 equ 16819 ; FIX(2.053119869) +F_2_562 equ 20995 ; FIX(2.562915447) +F_3_072 equ 25172 ; FIX(3.072711026) +%else +; NASM cannot do compile-time arithmetic on floating-point constants. +%define DESCALE(x,n) (((x)+(1<<((n)-1)))>>(n)) +F_0_298 equ DESCALE( 320652955,30-CONST_BITS) ; FIX(0.298631336) +F_0_390 equ DESCALE( 418953276,30-CONST_BITS) ; FIX(0.390180644) +F_0_541 equ DESCALE( 581104887,30-CONST_BITS) ; FIX(0.541196100) +F_0_765 equ DESCALE( 821806413,30-CONST_BITS) ; FIX(0.765366865) +F_0_899 equ DESCALE( 966342111,30-CONST_BITS) ; FIX(0.899976223) +F_1_175 equ DESCALE(1262586813,30-CONST_BITS) ; FIX(1.175875602) +F_1_501 equ DESCALE(1612031267,30-CONST_BITS) ; FIX(1.501321110) +F_1_847 equ DESCALE(1984016188,30-CONST_BITS) ; FIX(1.847759065) +F_1_961 equ DESCALE(2106220350,30-CONST_BITS) ; FIX(1.961570560) +F_2_053 equ DESCALE(2204520673,30-CONST_BITS) ; FIX(2.053119869) +F_2_562 equ DESCALE(2751909506,30-CONST_BITS) ; FIX(2.562915447) +F_3_072 equ DESCALE(3299298341,30-CONST_BITS) ; FIX(3.072711026) +%endif + +; -------------------------------------------------------------------------- + SECTION SEG_CONST + + alignz 16 + global EXTN(jconst_idct_islow_sse2) + +EXTN(jconst_idct_islow_sse2): + +PW_F130_F054 times 4 dw (F_0_541+F_0_765), F_0_541 +PW_F054_MF130 times 4 dw F_0_541, (F_0_541-F_1_847) +PW_MF078_F117 times 4 dw (F_1_175-F_1_961), F_1_175 +PW_F117_F078 times 4 dw F_1_175, (F_1_175-F_0_390) +PW_MF060_MF089 times 4 dw (F_0_298-F_0_899),-F_0_899 +PW_MF089_F060 times 4 dw -F_0_899, (F_1_501-F_0_899) +PW_MF050_MF256 times 4 dw (F_2_053-F_2_562),-F_2_562 +PW_MF256_F050 times 4 dw -F_2_562, (F_3_072-F_2_562) +PD_DESCALE_P1 times 4 dd 1 << (DESCALE_P1-1) +PD_DESCALE_P2 times 4 dd 1 << (DESCALE_P2-1) +PB_CENTERJSAMP times 16 db CENTERJSAMPLE + + alignz 16 + +; -------------------------------------------------------------------------- + SECTION SEG_TEXT + BITS 32 +; +; Perform dequantization and inverse DCT on one block of coefficients. +; +; GLOBAL(void) +; jsimd_idct_islow_sse2 (void *dct_table, JCOEFPTR coef_block, +; JSAMPARRAY output_buf, JDIMENSION output_col) +; + +%define dct_table(b) (b)+8 ; jpeg_component_info *compptr +%define coef_block(b) (b)+12 ; JCOEFPTR coef_block +%define output_buf(b) (b)+16 ; JSAMPARRAY output_buf +%define output_col(b) (b)+20 ; JDIMENSION output_col + +%define original_ebp ebp+0 +%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM] +%define WK_NUM 12 + + align 16 + global EXTN(jsimd_idct_islow_sse2) + +EXTN(jsimd_idct_islow_sse2): + push ebp + mov eax,esp ; eax = original ebp + sub esp, byte 4 + and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits + mov [esp],eax + mov ebp,esp ; ebp = aligned ebp + lea esp, [wk(0)] + pushpic ebx +; push ecx ; unused +; push edx ; need not be preserved + push esi + push edi + + get_GOT ebx ; get GOT address + + ; ---- Pass 1: process columns from input. + +; mov eax, [original_ebp] + mov edx, POINTER [dct_table(eax)] ; quantptr + mov esi, JCOEFPTR [coef_block(eax)] ; inptr + +%ifndef NO_ZERO_COLUMN_TEST_ISLOW_SSE2 + mov eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)] + or eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)] + jnz near .columnDCT + + movdqa xmm0, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)] + movdqa xmm1, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_JCOEF)] + por xmm0, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_JCOEF)] + por xmm1, XMMWORD [XMMBLOCK(4,0,esi,SIZEOF_JCOEF)] + por xmm0, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_JCOEF)] + por xmm1, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_JCOEF)] + por xmm0, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_JCOEF)] + por xmm1,xmm0 + packsswb xmm1,xmm1 + packsswb xmm1,xmm1 + movd eax,xmm1 + test eax,eax + jnz short .columnDCT + + ; -- AC terms all zero + + movdqa xmm5, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_JCOEF)] + pmullw xmm5, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)] + + psllw xmm5,PASS1_BITS + + movdqa xmm4,xmm5 ; xmm5=in0=(00 01 02 03 04 05 06 07) + punpcklwd xmm5,xmm5 ; xmm5=(00 00 01 01 02 02 03 03) + punpckhwd xmm4,xmm4 ; xmm4=(04 04 05 05 06 06 07 07) + + pshufd xmm7,xmm5,0x00 ; xmm7=col0=(00 00 00 00 00 00 00 00) + pshufd xmm6,xmm5,0x55 ; xmm6=col1=(01 01 01 01 01 01 01 01) + pshufd xmm1,xmm5,0xAA ; xmm1=col2=(02 02 02 02 02 02 02 02) + pshufd xmm5,xmm5,0xFF ; xmm5=col3=(03 03 03 03 03 03 03 03) + pshufd xmm0,xmm4,0x00 ; xmm0=col4=(04 04 04 04 04 04 04 04) + pshufd xmm3,xmm4,0x55 ; xmm3=col5=(05 05 05 05 05 05 05 05) + pshufd xmm2,xmm4,0xAA ; xmm2=col6=(06 06 06 06 06 06 06 06) + pshufd xmm4,xmm4,0xFF ; xmm4=col7=(07 07 07 07 07 07 07 07) + + movdqa XMMWORD [wk(8)], xmm6 ; wk(8)=col1 + movdqa XMMWORD [wk(9)], xmm5 ; wk(9)=col3 + movdqa XMMWORD [wk(10)], xmm3 ; wk(10)=col5 + movdqa XMMWORD [wk(11)], xmm4 ; wk(11)=col7 + jmp near .column_end + alignx 16,7 +%endif +.columnDCT: + + ; -- Even part + + movdqa xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_JCOEF)] + movdqa xmm1, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_JCOEF)] + pmullw xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)] + pmullw xmm1, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_ISLOW_MULT_TYPE)] + movdqa xmm2, XMMWORD [XMMBLOCK(4,0,esi,SIZEOF_JCOEF)] + movdqa xmm3, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_JCOEF)] + pmullw xmm2, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_ISLOW_MULT_TYPE)] + pmullw xmm3, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_ISLOW_MULT_TYPE)] + + ; (Original) + ; z1 = (z2 + z3) * 0.541196100; + ; tmp2 = z1 + z3 * -1.847759065; + ; tmp3 = z1 + z2 * 0.765366865; + ; + ; (This implementation) + ; tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065); + ; tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100; + + movdqa xmm4,xmm1 ; xmm1=in2=z2 + movdqa xmm5,xmm1 + punpcklwd xmm4,xmm3 ; xmm3=in6=z3 + punpckhwd xmm5,xmm3 + movdqa xmm1,xmm4 + movdqa xmm3,xmm5 + pmaddwd xmm4,[GOTOFF(ebx,PW_F130_F054)] ; xmm4=tmp3L + pmaddwd xmm5,[GOTOFF(ebx,PW_F130_F054)] ; xmm5=tmp3H + pmaddwd xmm1,[GOTOFF(ebx,PW_F054_MF130)] ; xmm1=tmp2L + pmaddwd xmm3,[GOTOFF(ebx,PW_F054_MF130)] ; xmm3=tmp2H + + movdqa xmm6,xmm0 + paddw xmm0,xmm2 ; xmm0=in0+in4 + psubw xmm6,xmm2 ; xmm6=in0-in4 + + pxor xmm7,xmm7 + pxor xmm2,xmm2 + punpcklwd xmm7,xmm0 ; xmm7=tmp0L + punpckhwd xmm2,xmm0 ; xmm2=tmp0H + psrad xmm7,(16-CONST_BITS) ; psrad xmm7,16 & pslld xmm7,CONST_BITS + psrad xmm2,(16-CONST_BITS) ; psrad xmm2,16 & pslld xmm2,CONST_BITS + + movdqa xmm0,xmm7 + paddd xmm7,xmm4 ; xmm7=tmp10L + psubd xmm0,xmm4 ; xmm0=tmp13L + movdqa xmm4,xmm2 + paddd xmm2,xmm5 ; xmm2=tmp10H + psubd xmm4,xmm5 ; xmm4=tmp13H + + movdqa XMMWORD [wk(0)], xmm7 ; wk(0)=tmp10L + movdqa XMMWORD [wk(1)], xmm2 ; wk(1)=tmp10H + movdqa XMMWORD [wk(2)], xmm0 ; wk(2)=tmp13L + movdqa XMMWORD [wk(3)], xmm4 ; wk(3)=tmp13H + + pxor xmm5,xmm5 + pxor xmm7,xmm7 + punpcklwd xmm5,xmm6 ; xmm5=tmp1L + punpckhwd xmm7,xmm6 ; xmm7=tmp1H + psrad xmm5,(16-CONST_BITS) ; psrad xmm5,16 & pslld xmm5,CONST_BITS + psrad xmm7,(16-CONST_BITS) ; psrad xmm7,16 & pslld xmm7,CONST_BITS + + movdqa xmm2,xmm5 + paddd xmm5,xmm1 ; xmm5=tmp11L + psubd xmm2,xmm1 ; xmm2=tmp12L + movdqa xmm0,xmm7 + paddd xmm7,xmm3 ; xmm7=tmp11H + psubd xmm0,xmm3 ; xmm0=tmp12H + + movdqa XMMWORD [wk(4)], xmm5 ; wk(4)=tmp11L + movdqa XMMWORD [wk(5)], xmm7 ; wk(5)=tmp11H + movdqa XMMWORD [wk(6)], xmm2 ; wk(6)=tmp12L + movdqa XMMWORD [wk(7)], xmm0 ; wk(7)=tmp12H + + ; -- Odd part + + movdqa xmm4, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)] + movdqa xmm6, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_JCOEF)] + pmullw xmm4, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_ISLOW_MULT_TYPE)] + pmullw xmm6, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_ISLOW_MULT_TYPE)] + movdqa xmm1, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_JCOEF)] + movdqa xmm3, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_JCOEF)] + pmullw xmm1, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_ISLOW_MULT_TYPE)] + pmullw xmm3, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_ISLOW_MULT_TYPE)] + + movdqa xmm5,xmm6 + movdqa xmm7,xmm4 + paddw xmm5,xmm3 ; xmm5=z3 + paddw xmm7,xmm1 ; xmm7=z4 + + ; (Original) + ; z5 = (z3 + z4) * 1.175875602; + ; z3 = z3 * -1.961570560; z4 = z4 * -0.390180644; + ; z3 += z5; z4 += z5; + ; + ; (This implementation) + ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602; + ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644); + + movdqa xmm2,xmm5 + movdqa xmm0,xmm5 + punpcklwd xmm2,xmm7 + punpckhwd xmm0,xmm7 + movdqa xmm5,xmm2 + movdqa xmm7,xmm0 + pmaddwd xmm2,[GOTOFF(ebx,PW_MF078_F117)] ; xmm2=z3L + pmaddwd xmm0,[GOTOFF(ebx,PW_MF078_F117)] ; xmm0=z3H + pmaddwd xmm5,[GOTOFF(ebx,PW_F117_F078)] ; xmm5=z4L + pmaddwd xmm7,[GOTOFF(ebx,PW_F117_F078)] ; xmm7=z4H + + movdqa XMMWORD [wk(10)], xmm2 ; wk(10)=z3L + movdqa XMMWORD [wk(11)], xmm0 ; wk(11)=z3H + + ; (Original) + ; z1 = tmp0 + tmp3; z2 = tmp1 + tmp2; + ; tmp0 = tmp0 * 0.298631336; tmp1 = tmp1 * 2.053119869; + ; tmp2 = tmp2 * 3.072711026; tmp3 = tmp3 * 1.501321110; + ; z1 = z1 * -0.899976223; z2 = z2 * -2.562915447; + ; tmp0 += z1 + z3; tmp1 += z2 + z4; + ; tmp2 += z2 + z3; tmp3 += z1 + z4; + ; + ; (This implementation) + ; tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223; + ; tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447; + ; tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447); + ; tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223); + ; tmp0 += z3; tmp1 += z4; + ; tmp2 += z3; tmp3 += z4; + + movdqa xmm2,xmm3 + movdqa xmm0,xmm3 + punpcklwd xmm2,xmm4 + punpckhwd xmm0,xmm4 + movdqa xmm3,xmm2 + movdqa xmm4,xmm0 + pmaddwd xmm2,[GOTOFF(ebx,PW_MF060_MF089)] ; xmm2=tmp0L + pmaddwd xmm0,[GOTOFF(ebx,PW_MF060_MF089)] ; xmm0=tmp0H + pmaddwd xmm3,[GOTOFF(ebx,PW_MF089_F060)] ; xmm3=tmp3L + pmaddwd xmm4,[GOTOFF(ebx,PW_MF089_F060)] ; xmm4=tmp3H + + paddd xmm2, XMMWORD [wk(10)] ; xmm2=tmp0L + paddd xmm0, XMMWORD [wk(11)] ; xmm0=tmp0H + paddd xmm3,xmm5 ; xmm3=tmp3L + paddd xmm4,xmm7 ; xmm4=tmp3H + + movdqa XMMWORD [wk(8)], xmm2 ; wk(8)=tmp0L + movdqa XMMWORD [wk(9)], xmm0 ; wk(9)=tmp0H + + movdqa xmm2,xmm1 + movdqa xmm0,xmm1 + punpcklwd xmm2,xmm6 + punpckhwd xmm0,xmm6 + movdqa xmm1,xmm2 + movdqa xmm6,xmm0 + pmaddwd xmm2,[GOTOFF(ebx,PW_MF050_MF256)] ; xmm2=tmp1L + pmaddwd xmm0,[GOTOFF(ebx,PW_MF050_MF256)] ; xmm0=tmp1H + pmaddwd xmm1,[GOTOFF(ebx,PW_MF256_F050)] ; xmm1=tmp2L + pmaddwd xmm6,[GOTOFF(ebx,PW_MF256_F050)] ; xmm6=tmp2H + + paddd xmm2,xmm5 ; xmm2=tmp1L + paddd xmm0,xmm7 ; xmm0=tmp1H + paddd xmm1, XMMWORD [wk(10)] ; xmm1=tmp2L + paddd xmm6, XMMWORD [wk(11)] ; xmm6=tmp2H + + movdqa XMMWORD [wk(10)], xmm2 ; wk(10)=tmp1L + movdqa XMMWORD [wk(11)], xmm0 ; wk(11)=tmp1H + + ; -- Final output stage + + movdqa xmm5, XMMWORD [wk(0)] ; xmm5=tmp10L + movdqa xmm7, XMMWORD [wk(1)] ; xmm7=tmp10H + + movdqa xmm2,xmm5 + movdqa xmm0,xmm7 + paddd xmm5,xmm3 ; xmm5=data0L + paddd xmm7,xmm4 ; xmm7=data0H + psubd xmm2,xmm3 ; xmm2=data7L + psubd xmm0,xmm4 ; xmm0=data7H + + movdqa xmm3,[GOTOFF(ebx,PD_DESCALE_P1)] ; xmm3=[PD_DESCALE_P1] + + paddd xmm5,xmm3 + paddd xmm7,xmm3 + psrad xmm5,DESCALE_P1 + psrad xmm7,DESCALE_P1 + paddd xmm2,xmm3 + paddd xmm0,xmm3 + psrad xmm2,DESCALE_P1 + psrad xmm0,DESCALE_P1 + + packssdw xmm5,xmm7 ; xmm5=data0=(00 01 02 03 04 05 06 07) + packssdw xmm2,xmm0 ; xmm2=data7=(70 71 72 73 74 75 76 77) + + movdqa xmm4, XMMWORD [wk(4)] ; xmm4=tmp11L + movdqa xmm3, XMMWORD [wk(5)] ; xmm3=tmp11H + + movdqa xmm7,xmm4 + movdqa xmm0,xmm3 + paddd xmm4,xmm1 ; xmm4=data1L + paddd xmm3,xmm6 ; xmm3=data1H + psubd xmm7,xmm1 ; xmm7=data6L + psubd xmm0,xmm6 ; xmm0=data6H + + movdqa xmm1,[GOTOFF(ebx,PD_DESCALE_P1)] ; xmm1=[PD_DESCALE_P1] + + paddd xmm4,xmm1 + paddd xmm3,xmm1 + psrad xmm4,DESCALE_P1 + psrad xmm3,DESCALE_P1 + paddd xmm7,xmm1 + paddd xmm0,xmm1 + psrad xmm7,DESCALE_P1 + psrad xmm0,DESCALE_P1 + + packssdw xmm4,xmm3 ; xmm4=data1=(10 11 12 13 14 15 16 17) + packssdw xmm7,xmm0 ; xmm7=data6=(60 61 62 63 64 65 66 67) + + movdqa xmm6,xmm5 ; transpose coefficients(phase 1) + punpcklwd xmm5,xmm4 ; xmm5=(00 10 01 11 02 12 03 13) + punpckhwd xmm6,xmm4 ; xmm6=(04 14 05 15 06 16 07 17) + movdqa xmm1,xmm7 ; transpose coefficients(phase 1) + punpcklwd xmm7,xmm2 ; xmm7=(60 70 61 71 62 72 63 73) + punpckhwd xmm1,xmm2 ; xmm1=(64 74 65 75 66 76 67 77) + + movdqa xmm3, XMMWORD [wk(6)] ; xmm3=tmp12L + movdqa xmm0, XMMWORD [wk(7)] ; xmm0=tmp12H + movdqa xmm4, XMMWORD [wk(10)] ; xmm4=tmp1L + movdqa xmm2, XMMWORD [wk(11)] ; xmm2=tmp1H + + movdqa XMMWORD [wk(0)], xmm5 ; wk(0)=(00 10 01 11 02 12 03 13) + movdqa XMMWORD [wk(1)], xmm6 ; wk(1)=(04 14 05 15 06 16 07 17) + movdqa XMMWORD [wk(4)], xmm7 ; wk(4)=(60 70 61 71 62 72 63 73) + movdqa XMMWORD [wk(5)], xmm1 ; wk(5)=(64 74 65 75 66 76 67 77) + + movdqa xmm5,xmm3 + movdqa xmm6,xmm0 + paddd xmm3,xmm4 ; xmm3=data2L + paddd xmm0,xmm2 ; xmm0=data2H + psubd xmm5,xmm4 ; xmm5=data5L + psubd xmm6,xmm2 ; xmm6=data5H + + movdqa xmm7,[GOTOFF(ebx,PD_DESCALE_P1)] ; xmm7=[PD_DESCALE_P1] + + paddd xmm3,xmm7 + paddd xmm0,xmm7 + psrad xmm3,DESCALE_P1 + psrad xmm0,DESCALE_P1 + paddd xmm5,xmm7 + paddd xmm6,xmm7 + psrad xmm5,DESCALE_P1 + psrad xmm6,DESCALE_P1 + + packssdw xmm3,xmm0 ; xmm3=data2=(20 21 22 23 24 25 26 27) + packssdw xmm5,xmm6 ; xmm5=data5=(50 51 52 53 54 55 56 57) + + movdqa xmm1, XMMWORD [wk(2)] ; xmm1=tmp13L + movdqa xmm4, XMMWORD [wk(3)] ; xmm4=tmp13H + movdqa xmm2, XMMWORD [wk(8)] ; xmm2=tmp0L + movdqa xmm7, XMMWORD [wk(9)] ; xmm7=tmp0H + + movdqa xmm0,xmm1 + movdqa xmm6,xmm4 + paddd xmm1,xmm2 ; xmm1=data3L + paddd xmm4,xmm7 ; xmm4=data3H + psubd xmm0,xmm2 ; xmm0=data4L + psubd xmm6,xmm7 ; xmm6=data4H + + movdqa xmm2,[GOTOFF(ebx,PD_DESCALE_P1)] ; xmm2=[PD_DESCALE_P1] + + paddd xmm1,xmm2 + paddd xmm4,xmm2 + psrad xmm1,DESCALE_P1 + psrad xmm4,DESCALE_P1 + paddd xmm0,xmm2 + paddd xmm6,xmm2 + psrad xmm0,DESCALE_P1 + psrad xmm6,DESCALE_P1 + + packssdw xmm1,xmm4 ; xmm1=data3=(30 31 32 33 34 35 36 37) + packssdw xmm0,xmm6 ; xmm0=data4=(40 41 42 43 44 45 46 47) + + movdqa xmm7, XMMWORD [wk(0)] ; xmm7=(00 10 01 11 02 12 03 13) + movdqa xmm2, XMMWORD [wk(1)] ; xmm2=(04 14 05 15 06 16 07 17) + + movdqa xmm4,xmm3 ; transpose coefficients(phase 1) + punpcklwd xmm3,xmm1 ; xmm3=(20 30 21 31 22 32 23 33) + punpckhwd xmm4,xmm1 ; xmm4=(24 34 25 35 26 36 27 37) + movdqa xmm6,xmm0 ; transpose coefficients(phase 1) + punpcklwd xmm0,xmm5 ; xmm0=(40 50 41 51 42 52 43 53) + punpckhwd xmm6,xmm5 ; xmm6=(44 54 45 55 46 56 47 57) + + movdqa xmm1,xmm7 ; transpose coefficients(phase 2) + punpckldq xmm7,xmm3 ; xmm7=(00 10 20 30 01 11 21 31) + punpckhdq xmm1,xmm3 ; xmm1=(02 12 22 32 03 13 23 33) + movdqa xmm5,xmm2 ; transpose coefficients(phase 2) + punpckldq xmm2,xmm4 ; xmm2=(04 14 24 34 05 15 25 35) + punpckhdq xmm5,xmm4 ; xmm5=(06 16 26 36 07 17 27 37) + + movdqa xmm3, XMMWORD [wk(4)] ; xmm3=(60 70 61 71 62 72 63 73) + movdqa xmm4, XMMWORD [wk(5)] ; xmm4=(64 74 65 75 66 76 67 77) + + movdqa XMMWORD [wk(6)], xmm2 ; wk(6)=(04 14 24 34 05 15 25 35) + movdqa XMMWORD [wk(7)], xmm5 ; wk(7)=(06 16 26 36 07 17 27 37) + + movdqa xmm2,xmm0 ; transpose coefficients(phase 2) + punpckldq xmm0,xmm3 ; xmm0=(40 50 60 70 41 51 61 71) + punpckhdq xmm2,xmm3 ; xmm2=(42 52 62 72 43 53 63 73) + movdqa xmm5,xmm6 ; transpose coefficients(phase 2) + punpckldq xmm6,xmm4 ; xmm6=(44 54 64 74 45 55 65 75) + punpckhdq xmm5,xmm4 ; xmm5=(46 56 66 76 47 57 67 77) + + movdqa xmm3,xmm7 ; transpose coefficients(phase 3) + punpcklqdq xmm7,xmm0 ; xmm7=col0=(00 10 20 30 40 50 60 70) + punpckhqdq xmm3,xmm0 ; xmm3=col1=(01 11 21 31 41 51 61 71) + movdqa xmm4,xmm1 ; transpose coefficients(phase 3) + punpcklqdq xmm1,xmm2 ; xmm1=col2=(02 12 22 32 42 52 62 72) + punpckhqdq xmm4,xmm2 ; xmm4=col3=(03 13 23 33 43 53 63 73) + + movdqa xmm0, XMMWORD [wk(6)] ; xmm0=(04 14 24 34 05 15 25 35) + movdqa xmm2, XMMWORD [wk(7)] ; xmm2=(06 16 26 36 07 17 27 37) + + movdqa XMMWORD [wk(8)], xmm3 ; wk(8)=col1 + movdqa XMMWORD [wk(9)], xmm4 ; wk(9)=col3 + + movdqa xmm3,xmm0 ; transpose coefficients(phase 3) + punpcklqdq xmm0,xmm6 ; xmm0=col4=(04 14 24 34 44 54 64 74) + punpckhqdq xmm3,xmm6 ; xmm3=col5=(05 15 25 35 45 55 65 75) + movdqa xmm4,xmm2 ; transpose coefficients(phase 3) + punpcklqdq xmm2,xmm5 ; xmm2=col6=(06 16 26 36 46 56 66 76) + punpckhqdq xmm4,xmm5 ; xmm4=col7=(07 17 27 37 47 57 67 77) + + movdqa XMMWORD [wk(10)], xmm3 ; wk(10)=col5 + movdqa XMMWORD [wk(11)], xmm4 ; wk(11)=col7 +.column_end: + + ; -- Prefetch the next coefficient block + + prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 0*32] + prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 1*32] + prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 2*32] + prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 3*32] + + ; ---- Pass 2: process rows from work array, store into output array. + + mov eax, [original_ebp] + mov edi, JSAMPARRAY [output_buf(eax)] ; (JSAMPROW *) + mov eax, JDIMENSION [output_col(eax)] + + ; -- Even part + + ; xmm7=col0, xmm1=col2, xmm0=col4, xmm2=col6 + + ; (Original) + ; z1 = (z2 + z3) * 0.541196100; + ; tmp2 = z1 + z3 * -1.847759065; + ; tmp3 = z1 + z2 * 0.765366865; + ; + ; (This implementation) + ; tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065); + ; tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100; + + movdqa xmm6,xmm1 ; xmm1=in2=z2 + movdqa xmm5,xmm1 + punpcklwd xmm6,xmm2 ; xmm2=in6=z3 + punpckhwd xmm5,xmm2 + movdqa xmm1,xmm6 + movdqa xmm2,xmm5 + pmaddwd xmm6,[GOTOFF(ebx,PW_F130_F054)] ; xmm6=tmp3L + pmaddwd xmm5,[GOTOFF(ebx,PW_F130_F054)] ; xmm5=tmp3H + pmaddwd xmm1,[GOTOFF(ebx,PW_F054_MF130)] ; xmm1=tmp2L + pmaddwd xmm2,[GOTOFF(ebx,PW_F054_MF130)] ; xmm2=tmp2H + + movdqa xmm3,xmm7 + paddw xmm7,xmm0 ; xmm7=in0+in4 + psubw xmm3,xmm0 ; xmm3=in0-in4 + + pxor xmm4,xmm4 + pxor xmm0,xmm0 + punpcklwd xmm4,xmm7 ; xmm4=tmp0L + punpckhwd xmm0,xmm7 ; xmm0=tmp0H + psrad xmm4,(16-CONST_BITS) ; psrad xmm4,16 & pslld xmm4,CONST_BITS + psrad xmm0,(16-CONST_BITS) ; psrad xmm0,16 & pslld xmm0,CONST_BITS + + movdqa xmm7,xmm4 + paddd xmm4,xmm6 ; xmm4=tmp10L + psubd xmm7,xmm6 ; xmm7=tmp13L + movdqa xmm6,xmm0 + paddd xmm0,xmm5 ; xmm0=tmp10H + psubd xmm6,xmm5 ; xmm6=tmp13H + + movdqa XMMWORD [wk(0)], xmm4 ; wk(0)=tmp10L + movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=tmp10H + movdqa XMMWORD [wk(2)], xmm7 ; wk(2)=tmp13L + movdqa XMMWORD [wk(3)], xmm6 ; wk(3)=tmp13H + + pxor xmm5,xmm5 + pxor xmm4,xmm4 + punpcklwd xmm5,xmm3 ; xmm5=tmp1L + punpckhwd xmm4,xmm3 ; xmm4=tmp1H + psrad xmm5,(16-CONST_BITS) ; psrad xmm5,16 & pslld xmm5,CONST_BITS + psrad xmm4,(16-CONST_BITS) ; psrad xmm4,16 & pslld xmm4,CONST_BITS + + movdqa xmm0,xmm5 + paddd xmm5,xmm1 ; xmm5=tmp11L + psubd xmm0,xmm1 ; xmm0=tmp12L + movdqa xmm7,xmm4 + paddd xmm4,xmm2 ; xmm4=tmp11H + psubd xmm7,xmm2 ; xmm7=tmp12H + + movdqa XMMWORD [wk(4)], xmm5 ; wk(4)=tmp11L + movdqa XMMWORD [wk(5)], xmm4 ; wk(5)=tmp11H + movdqa XMMWORD [wk(6)], xmm0 ; wk(6)=tmp12L + movdqa XMMWORD [wk(7)], xmm7 ; wk(7)=tmp12H + + ; -- Odd part + + movdqa xmm6, XMMWORD [wk(9)] ; xmm6=col3 + movdqa xmm3, XMMWORD [wk(8)] ; xmm3=col1 + movdqa xmm1, XMMWORD [wk(11)] ; xmm1=col7 + movdqa xmm2, XMMWORD [wk(10)] ; xmm2=col5 + + movdqa xmm5,xmm6 + movdqa xmm4,xmm3 + paddw xmm5,xmm1 ; xmm5=z3 + paddw xmm4,xmm2 ; xmm4=z4 + + ; (Original) + ; z5 = (z3 + z4) * 1.175875602; + ; z3 = z3 * -1.961570560; z4 = z4 * -0.390180644; + ; z3 += z5; z4 += z5; + ; + ; (This implementation) + ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602; + ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644); + + movdqa xmm0,xmm5 + movdqa xmm7,xmm5 + punpcklwd xmm0,xmm4 + punpckhwd xmm7,xmm4 + movdqa xmm5,xmm0 + movdqa xmm4,xmm7 + pmaddwd xmm0,[GOTOFF(ebx,PW_MF078_F117)] ; xmm0=z3L + pmaddwd xmm7,[GOTOFF(ebx,PW_MF078_F117)] ; xmm7=z3H + pmaddwd xmm5,[GOTOFF(ebx,PW_F117_F078)] ; xmm5=z4L + pmaddwd xmm4,[GOTOFF(ebx,PW_F117_F078)] ; xmm4=z4H + + movdqa XMMWORD [wk(10)], xmm0 ; wk(10)=z3L + movdqa XMMWORD [wk(11)], xmm7 ; wk(11)=z3H + + ; (Original) + ; z1 = tmp0 + tmp3; z2 = tmp1 + tmp2; + ; tmp0 = tmp0 * 0.298631336; tmp1 = tmp1 * 2.053119869; + ; tmp2 = tmp2 * 3.072711026; tmp3 = tmp3 * 1.501321110; + ; z1 = z1 * -0.899976223; z2 = z2 * -2.562915447; + ; tmp0 += z1 + z3; tmp1 += z2 + z4; + ; tmp2 += z2 + z3; tmp3 += z1 + z4; + ; + ; (This implementation) + ; tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223; + ; tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447; + ; tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447); + ; tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223); + ; tmp0 += z3; tmp1 += z4; + ; tmp2 += z3; tmp3 += z4; + + movdqa xmm0,xmm1 + movdqa xmm7,xmm1 + punpcklwd xmm0,xmm3 + punpckhwd xmm7,xmm3 + movdqa xmm1,xmm0 + movdqa xmm3,xmm7 + pmaddwd xmm0,[GOTOFF(ebx,PW_MF060_MF089)] ; xmm0=tmp0L + pmaddwd xmm7,[GOTOFF(ebx,PW_MF060_MF089)] ; xmm7=tmp0H + pmaddwd xmm1,[GOTOFF(ebx,PW_MF089_F060)] ; xmm1=tmp3L + pmaddwd xmm3,[GOTOFF(ebx,PW_MF089_F060)] ; xmm3=tmp3H + + paddd xmm0, XMMWORD [wk(10)] ; xmm0=tmp0L + paddd xmm7, XMMWORD [wk(11)] ; xmm7=tmp0H + paddd xmm1,xmm5 ; xmm1=tmp3L + paddd xmm3,xmm4 ; xmm3=tmp3H + + movdqa XMMWORD [wk(8)], xmm0 ; wk(8)=tmp0L + movdqa XMMWORD [wk(9)], xmm7 ; wk(9)=tmp0H + + movdqa xmm0,xmm2 + movdqa xmm7,xmm2 + punpcklwd xmm0,xmm6 + punpckhwd xmm7,xmm6 + movdqa xmm2,xmm0 + movdqa xmm6,xmm7 + pmaddwd xmm0,[GOTOFF(ebx,PW_MF050_MF256)] ; xmm0=tmp1L + pmaddwd xmm7,[GOTOFF(ebx,PW_MF050_MF256)] ; xmm7=tmp1H + pmaddwd xmm2,[GOTOFF(ebx,PW_MF256_F050)] ; xmm2=tmp2L + pmaddwd xmm6,[GOTOFF(ebx,PW_MF256_F050)] ; xmm6=tmp2H + + paddd xmm0,xmm5 ; xmm0=tmp1L + paddd xmm7,xmm4 ; xmm7=tmp1H + paddd xmm2, XMMWORD [wk(10)] ; xmm2=tmp2L + paddd xmm6, XMMWORD [wk(11)] ; xmm6=tmp2H + + movdqa XMMWORD [wk(10)], xmm0 ; wk(10)=tmp1L + movdqa XMMWORD [wk(11)], xmm7 ; wk(11)=tmp1H + + ; -- Final output stage + + movdqa xmm5, XMMWORD [wk(0)] ; xmm5=tmp10L + movdqa xmm4, XMMWORD [wk(1)] ; xmm4=tmp10H + + movdqa xmm0,xmm5 + movdqa xmm7,xmm4 + paddd xmm5,xmm1 ; xmm5=data0L + paddd xmm4,xmm3 ; xmm4=data0H + psubd xmm0,xmm1 ; xmm0=data7L + psubd xmm7,xmm3 ; xmm7=data7H + + movdqa xmm1,[GOTOFF(ebx,PD_DESCALE_P2)] ; xmm1=[PD_DESCALE_P2] + + paddd xmm5,xmm1 + paddd xmm4,xmm1 + psrad xmm5,DESCALE_P2 + psrad xmm4,DESCALE_P2 + paddd xmm0,xmm1 + paddd xmm7,xmm1 + psrad xmm0,DESCALE_P2 + psrad xmm7,DESCALE_P2 + + packssdw xmm5,xmm4 ; xmm5=data0=(00 10 20 30 40 50 60 70) + packssdw xmm0,xmm7 ; xmm0=data7=(07 17 27 37 47 57 67 77) + + movdqa xmm3, XMMWORD [wk(4)] ; xmm3=tmp11L + movdqa xmm1, XMMWORD [wk(5)] ; xmm1=tmp11H + + movdqa xmm4,xmm3 + movdqa xmm7,xmm1 + paddd xmm3,xmm2 ; xmm3=data1L + paddd xmm1,xmm6 ; xmm1=data1H + psubd xmm4,xmm2 ; xmm4=data6L + psubd xmm7,xmm6 ; xmm7=data6H + + movdqa xmm2,[GOTOFF(ebx,PD_DESCALE_P2)] ; xmm2=[PD_DESCALE_P2] + + paddd xmm3,xmm2 + paddd xmm1,xmm2 + psrad xmm3,DESCALE_P2 + psrad xmm1,DESCALE_P2 + paddd xmm4,xmm2 + paddd xmm7,xmm2 + psrad xmm4,DESCALE_P2 + psrad xmm7,DESCALE_P2 + + packssdw xmm3,xmm1 ; xmm3=data1=(01 11 21 31 41 51 61 71) + packssdw xmm4,xmm7 ; xmm4=data6=(06 16 26 36 46 56 66 76) + + packsswb xmm5,xmm4 ; xmm5=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76) + packsswb xmm3,xmm0 ; xmm3=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77) + + movdqa xmm6, XMMWORD [wk(6)] ; xmm6=tmp12L + movdqa xmm2, XMMWORD [wk(7)] ; xmm2=tmp12H + movdqa xmm1, XMMWORD [wk(10)] ; xmm1=tmp1L + movdqa xmm7, XMMWORD [wk(11)] ; xmm7=tmp1H + + movdqa XMMWORD [wk(0)], xmm5 ; wk(0)=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76) + movdqa XMMWORD [wk(1)], xmm3 ; wk(1)=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77) + + movdqa xmm4,xmm6 + movdqa xmm0,xmm2 + paddd xmm6,xmm1 ; xmm6=data2L + paddd xmm2,xmm7 ; xmm2=data2H + psubd xmm4,xmm1 ; xmm4=data5L + psubd xmm0,xmm7 ; xmm0=data5H + + movdqa xmm5,[GOTOFF(ebx,PD_DESCALE_P2)] ; xmm5=[PD_DESCALE_P2] + + paddd xmm6,xmm5 + paddd xmm2,xmm5 + psrad xmm6,DESCALE_P2 + psrad xmm2,DESCALE_P2 + paddd xmm4,xmm5 + paddd xmm0,xmm5 + psrad xmm4,DESCALE_P2 + psrad xmm0,DESCALE_P2 + + packssdw xmm6,xmm2 ; xmm6=data2=(02 12 22 32 42 52 62 72) + packssdw xmm4,xmm0 ; xmm4=data5=(05 15 25 35 45 55 65 75) + + movdqa xmm3, XMMWORD [wk(2)] ; xmm3=tmp13L + movdqa xmm1, XMMWORD [wk(3)] ; xmm1=tmp13H + movdqa xmm7, XMMWORD [wk(8)] ; xmm7=tmp0L + movdqa xmm5, XMMWORD [wk(9)] ; xmm5=tmp0H + + movdqa xmm2,xmm3 + movdqa xmm0,xmm1 + paddd xmm3,xmm7 ; xmm3=data3L + paddd xmm1,xmm5 ; xmm1=data3H + psubd xmm2,xmm7 ; xmm2=data4L + psubd xmm0,xmm5 ; xmm0=data4H + + movdqa xmm7,[GOTOFF(ebx,PD_DESCALE_P2)] ; xmm7=[PD_DESCALE_P2] + + paddd xmm3,xmm7 + paddd xmm1,xmm7 + psrad xmm3,DESCALE_P2 + psrad xmm1,DESCALE_P2 + paddd xmm2,xmm7 + paddd xmm0,xmm7 + psrad xmm2,DESCALE_P2 + psrad xmm0,DESCALE_P2 + + movdqa xmm5,[GOTOFF(ebx,PB_CENTERJSAMP)] ; xmm5=[PB_CENTERJSAMP] + + packssdw xmm3,xmm1 ; xmm3=data3=(03 13 23 33 43 53 63 73) + packssdw xmm2,xmm0 ; xmm2=data4=(04 14 24 34 44 54 64 74) + + movdqa xmm7, XMMWORD [wk(0)] ; xmm7=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76) + movdqa xmm1, XMMWORD [wk(1)] ; xmm1=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77) + + packsswb xmm6,xmm2 ; xmm6=(02 12 22 32 42 52 62 72 04 14 24 34 44 54 64 74) + packsswb xmm3,xmm4 ; xmm3=(03 13 23 33 43 53 63 73 05 15 25 35 45 55 65 75) + + paddb xmm7,xmm5 + paddb xmm1,xmm5 + paddb xmm6,xmm5 + paddb xmm3,xmm5 + + movdqa xmm0,xmm7 ; transpose coefficients(phase 1) + punpcklbw xmm7,xmm1 ; xmm7=(00 01 10 11 20 21 30 31 40 41 50 51 60 61 70 71) + punpckhbw xmm0,xmm1 ; xmm0=(06 07 16 17 26 27 36 37 46 47 56 57 66 67 76 77) + movdqa xmm2,xmm6 ; transpose coefficients(phase 1) + punpcklbw xmm6,xmm3 ; xmm6=(02 03 12 13 22 23 32 33 42 43 52 53 62 63 72 73) + punpckhbw xmm2,xmm3 ; xmm2=(04 05 14 15 24 25 34 35 44 45 54 55 64 65 74 75) + + movdqa xmm4,xmm7 ; transpose coefficients(phase 2) + punpcklwd xmm7,xmm6 ; xmm7=(00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33) + punpckhwd xmm4,xmm6 ; xmm4=(40 41 42 43 50 51 52 53 60 61 62 63 70 71 72 73) + movdqa xmm5,xmm2 ; transpose coefficients(phase 2) + punpcklwd xmm2,xmm0 ; xmm2=(04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37) + punpckhwd xmm5,xmm0 ; xmm5=(44 45 46 47 54 55 56 57 64 65 66 67 74 75 76 77) + + movdqa xmm1,xmm7 ; transpose coefficients(phase 3) + punpckldq xmm7,xmm2 ; xmm7=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17) + punpckhdq xmm1,xmm2 ; xmm1=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37) + movdqa xmm3,xmm4 ; transpose coefficients(phase 3) + punpckldq xmm4,xmm5 ; xmm4=(40 41 42 43 44 45 46 47 50 51 52 53 54 55 56 57) + punpckhdq xmm3,xmm5 ; xmm3=(60 61 62 63 64 65 66 67 70 71 72 73 74 75 76 77) + + pshufd xmm6,xmm7,0x4E ; xmm6=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07) + pshufd xmm0,xmm1,0x4E ; xmm0=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27) + pshufd xmm2,xmm4,0x4E ; xmm2=(50 51 52 53 54 55 56 57 40 41 42 43 44 45 46 47) + pshufd xmm5,xmm3,0x4E ; xmm5=(70 71 72 73 74 75 76 77 60 61 62 63 64 65 66 67) + + mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW] + mov esi, JSAMPROW [edi+2*SIZEOF_JSAMPROW] + movq XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm7 + movq XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm1 + mov edx, JSAMPROW [edi+4*SIZEOF_JSAMPROW] + mov esi, JSAMPROW [edi+6*SIZEOF_JSAMPROW] + movq XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm4 + movq XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm3 + + mov edx, JSAMPROW [edi+1*SIZEOF_JSAMPROW] + mov esi, JSAMPROW [edi+3*SIZEOF_JSAMPROW] + movq XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm6 + movq XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm0 + mov edx, JSAMPROW [edi+5*SIZEOF_JSAMPROW] + mov esi, JSAMPROW [edi+7*SIZEOF_JSAMPROW] + movq XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm2 + movq XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm5 + + pop edi + pop esi +; pop edx ; need not be preserved +; pop ecx ; unused + poppic ebx + mov esp,ebp ; esp <- aligned ebp + pop esp ; esp <- original ebp + pop ebp + ret + +; For some reason, the OS X linker does not honor the request to align the +; segment unless we do this. + align 16 diff --git a/media/libjpeg/simd/jidctred-mmx.asm b/media/libjpeg/simd/jidctred-mmx.asm new file mode 100644 index 000000000..ba054e31a --- /dev/null +++ b/media/libjpeg/simd/jidctred-mmx.asm @@ -0,0 +1,705 @@ +; +; jidctred.asm - reduced-size IDCT (MMX) +; +; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB +; +; Based on the x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler). +; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 +; +; This file contains inverse-DCT routines that produce reduced-size +; output: either 4x4 or 2x2 pixels from an 8x8 DCT block. +; The following code is based directly on the IJG's original jidctred.c; +; see the jidctred.c for more details. +; +; [TAB8] + +%include "jsimdext.inc" +%include "jdct.inc" + +; -------------------------------------------------------------------------- + +%define CONST_BITS 13 +%define PASS1_BITS 2 + +%define DESCALE_P1_4 (CONST_BITS-PASS1_BITS+1) +%define DESCALE_P2_4 (CONST_BITS+PASS1_BITS+3+1) +%define DESCALE_P1_2 (CONST_BITS-PASS1_BITS+2) +%define DESCALE_P2_2 (CONST_BITS+PASS1_BITS+3+2) + +%if CONST_BITS == 13 +F_0_211 equ 1730 ; FIX(0.211164243) +F_0_509 equ 4176 ; FIX(0.509795579) +F_0_601 equ 4926 ; FIX(0.601344887) +F_0_720 equ 5906 ; FIX(0.720959822) +F_0_765 equ 6270 ; FIX(0.765366865) +F_0_850 equ 6967 ; FIX(0.850430095) +F_0_899 equ 7373 ; FIX(0.899976223) +F_1_061 equ 8697 ; FIX(1.061594337) +F_1_272 equ 10426 ; FIX(1.272758580) +F_1_451 equ 11893 ; FIX(1.451774981) +F_1_847 equ 15137 ; FIX(1.847759065) +F_2_172 equ 17799 ; FIX(2.172734803) +F_2_562 equ 20995 ; FIX(2.562915447) +F_3_624 equ 29692 ; FIX(3.624509785) +%else +; NASM cannot do compile-time arithmetic on floating-point constants. +%define DESCALE(x,n) (((x)+(1<<((n)-1)))>>(n)) +F_0_211 equ DESCALE( 226735879,30-CONST_BITS) ; FIX(0.211164243) +F_0_509 equ DESCALE( 547388834,30-CONST_BITS) ; FIX(0.509795579) +F_0_601 equ DESCALE( 645689155,30-CONST_BITS) ; FIX(0.601344887) +F_0_720 equ DESCALE( 774124714,30-CONST_BITS) ; FIX(0.720959822) +F_0_765 equ DESCALE( 821806413,30-CONST_BITS) ; FIX(0.765366865) +F_0_850 equ DESCALE( 913142361,30-CONST_BITS) ; FIX(0.850430095) +F_0_899 equ DESCALE( 966342111,30-CONST_BITS) ; FIX(0.899976223) +F_1_061 equ DESCALE(1139878239,30-CONST_BITS) ; FIX(1.061594337) +F_1_272 equ DESCALE(1366614119,30-CONST_BITS) ; FIX(1.272758580) +F_1_451 equ DESCALE(1558831516,30-CONST_BITS) ; FIX(1.451774981) +F_1_847 equ DESCALE(1984016188,30-CONST_BITS) ; FIX(1.847759065) +F_2_172 equ DESCALE(2332956230,30-CONST_BITS) ; FIX(2.172734803) +F_2_562 equ DESCALE(2751909506,30-CONST_BITS) ; FIX(2.562915447) +F_3_624 equ DESCALE(3891787747,30-CONST_BITS) ; FIX(3.624509785) +%endif + +; -------------------------------------------------------------------------- + SECTION SEG_CONST + + alignz 16 + global EXTN(jconst_idct_red_mmx) + +EXTN(jconst_idct_red_mmx): + +PW_F184_MF076 times 2 dw F_1_847,-F_0_765 +PW_F256_F089 times 2 dw F_2_562, F_0_899 +PW_F106_MF217 times 2 dw F_1_061,-F_2_172 +PW_MF060_MF050 times 2 dw -F_0_601,-F_0_509 +PW_F145_MF021 times 2 dw F_1_451,-F_0_211 +PW_F362_MF127 times 2 dw F_3_624,-F_1_272 +PW_F085_MF072 times 2 dw F_0_850,-F_0_720 +PD_DESCALE_P1_4 times 2 dd 1 << (DESCALE_P1_4-1) +PD_DESCALE_P2_4 times 2 dd 1 << (DESCALE_P2_4-1) +PD_DESCALE_P1_2 times 2 dd 1 << (DESCALE_P1_2-1) +PD_DESCALE_P2_2 times 2 dd 1 << (DESCALE_P2_2-1) +PB_CENTERJSAMP times 8 db CENTERJSAMPLE + + alignz 16 + +; -------------------------------------------------------------------------- + SECTION SEG_TEXT + BITS 32 +; +; Perform dequantization and inverse DCT on one block of coefficients, +; producing a reduced-size 4x4 output block. +; +; GLOBAL(void) +; jsimd_idct_4x4_mmx (void *dct_table, JCOEFPTR coef_block, +; JSAMPARRAY output_buf, JDIMENSION output_col) +; + +%define dct_table(b) (b)+8 ; void *dct_table +%define coef_block(b) (b)+12 ; JCOEFPTR coef_block +%define output_buf(b) (b)+16 ; JSAMPARRAY output_buf +%define output_col(b) (b)+20 ; JDIMENSION output_col + +%define original_ebp ebp+0 +%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_MMWORD ; mmword wk[WK_NUM] +%define WK_NUM 2 +%define workspace wk(0)-DCTSIZE2*SIZEOF_JCOEF + ; JCOEF workspace[DCTSIZE2] + + align 16 + global EXTN(jsimd_idct_4x4_mmx) + +EXTN(jsimd_idct_4x4_mmx): + push ebp + mov eax,esp ; eax = original ebp + sub esp, byte 4 + and esp, byte (-SIZEOF_MMWORD) ; align to 64 bits + mov [esp],eax + mov ebp,esp ; ebp = aligned ebp + lea esp, [workspace] + pushpic ebx +; push ecx ; need not be preserved +; push edx ; need not be preserved + push esi + push edi + + get_GOT ebx ; get GOT address + + ; ---- Pass 1: process columns from input, store into work array. + +; mov eax, [original_ebp] + mov edx, POINTER [dct_table(eax)] ; quantptr + mov esi, JCOEFPTR [coef_block(eax)] ; inptr + lea edi, [workspace] ; JCOEF *wsptr + mov ecx, DCTSIZE/4 ; ctr + alignx 16,7 +.columnloop: +%ifndef NO_ZERO_COLUMN_TEST_4X4_MMX + mov eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)] + or eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)] + jnz short .columnDCT + + movq mm0, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)] + movq mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)] + por mm0, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)] + por mm1, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)] + por mm0, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)] + por mm1, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)] + por mm0,mm1 + packsswb mm0,mm0 + movd eax,mm0 + test eax,eax + jnz short .columnDCT + + ; -- AC terms all zero + + movq mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)] + pmullw mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)] + + psllw mm0,PASS1_BITS + + movq mm2,mm0 ; mm0=in0=(00 01 02 03) + punpcklwd mm0,mm0 ; mm0=(00 00 01 01) + punpckhwd mm2,mm2 ; mm2=(02 02 03 03) + + movq mm1,mm0 + punpckldq mm0,mm0 ; mm0=(00 00 00 00) + punpckhdq mm1,mm1 ; mm1=(01 01 01 01) + movq mm3,mm2 + punpckldq mm2,mm2 ; mm2=(02 02 02 02) + punpckhdq mm3,mm3 ; mm3=(03 03 03 03) + + movq MMWORD [MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm0 + movq MMWORD [MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm1 + movq MMWORD [MMBLOCK(2,0,edi,SIZEOF_JCOEF)], mm2 + movq MMWORD [MMBLOCK(3,0,edi,SIZEOF_JCOEF)], mm3 + jmp near .nextcolumn + alignx 16,7 +%endif +.columnDCT: + + ; -- Odd part + + movq mm0, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)] + movq mm1, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)] + pmullw mm0, MMWORD [MMBLOCK(1,0,edx,SIZEOF_ISLOW_MULT_TYPE)] + pmullw mm1, MMWORD [MMBLOCK(3,0,edx,SIZEOF_ISLOW_MULT_TYPE)] + movq mm2, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)] + movq mm3, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)] + pmullw mm2, MMWORD [MMBLOCK(5,0,edx,SIZEOF_ISLOW_MULT_TYPE)] + pmullw mm3, MMWORD [MMBLOCK(7,0,edx,SIZEOF_ISLOW_MULT_TYPE)] + + movq mm4,mm0 + movq mm5,mm0 + punpcklwd mm4,mm1 + punpckhwd mm5,mm1 + movq mm0,mm4 + movq mm1,mm5 + pmaddwd mm4,[GOTOFF(ebx,PW_F256_F089)] ; mm4=(tmp2L) + pmaddwd mm5,[GOTOFF(ebx,PW_F256_F089)] ; mm5=(tmp2H) + pmaddwd mm0,[GOTOFF(ebx,PW_F106_MF217)] ; mm0=(tmp0L) + pmaddwd mm1,[GOTOFF(ebx,PW_F106_MF217)] ; mm1=(tmp0H) + + movq mm6,mm2 + movq mm7,mm2 + punpcklwd mm6,mm3 + punpckhwd mm7,mm3 + movq mm2,mm6 + movq mm3,mm7 + pmaddwd mm6,[GOTOFF(ebx,PW_MF060_MF050)] ; mm6=(tmp2L) + pmaddwd mm7,[GOTOFF(ebx,PW_MF060_MF050)] ; mm7=(tmp2H) + pmaddwd mm2,[GOTOFF(ebx,PW_F145_MF021)] ; mm2=(tmp0L) + pmaddwd mm3,[GOTOFF(ebx,PW_F145_MF021)] ; mm3=(tmp0H) + + paddd mm6,mm4 ; mm6=tmp2L + paddd mm7,mm5 ; mm7=tmp2H + paddd mm2,mm0 ; mm2=tmp0L + paddd mm3,mm1 ; mm3=tmp0H + + movq MMWORD [wk(0)], mm2 ; wk(0)=tmp0L + movq MMWORD [wk(1)], mm3 ; wk(1)=tmp0H + + ; -- Even part + + movq mm4, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)] + movq mm5, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)] + movq mm0, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)] + pmullw mm4, MMWORD [MMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)] + pmullw mm5, MMWORD [MMBLOCK(2,0,edx,SIZEOF_ISLOW_MULT_TYPE)] + pmullw mm0, MMWORD [MMBLOCK(6,0,edx,SIZEOF_ISLOW_MULT_TYPE)] + + pxor mm1,mm1 + pxor mm2,mm2 + punpcklwd mm1,mm4 ; mm1=tmp0L + punpckhwd mm2,mm4 ; mm2=tmp0H + psrad mm1,(16-CONST_BITS-1) ; psrad mm1,16 & pslld mm1,CONST_BITS+1 + psrad mm2,(16-CONST_BITS-1) ; psrad mm2,16 & pslld mm2,CONST_BITS+1 + + movq mm3,mm5 ; mm5=in2=z2 + punpcklwd mm5,mm0 ; mm0=in6=z3 + punpckhwd mm3,mm0 + pmaddwd mm5,[GOTOFF(ebx,PW_F184_MF076)] ; mm5=tmp2L + pmaddwd mm3,[GOTOFF(ebx,PW_F184_MF076)] ; mm3=tmp2H + + movq mm4,mm1 + movq mm0,mm2 + paddd mm1,mm5 ; mm1=tmp10L + paddd mm2,mm3 ; mm2=tmp10H + psubd mm4,mm5 ; mm4=tmp12L + psubd mm0,mm3 ; mm0=tmp12H + + ; -- Final output stage + + movq mm5,mm1 + movq mm3,mm2 + paddd mm1,mm6 ; mm1=data0L + paddd mm2,mm7 ; mm2=data0H + psubd mm5,mm6 ; mm5=data3L + psubd mm3,mm7 ; mm3=data3H + + movq mm6,[GOTOFF(ebx,PD_DESCALE_P1_4)] ; mm6=[PD_DESCALE_P1_4] + + paddd mm1,mm6 + paddd mm2,mm6 + psrad mm1,DESCALE_P1_4 + psrad mm2,DESCALE_P1_4 + paddd mm5,mm6 + paddd mm3,mm6 + psrad mm5,DESCALE_P1_4 + psrad mm3,DESCALE_P1_4 + + packssdw mm1,mm2 ; mm1=data0=(00 01 02 03) + packssdw mm5,mm3 ; mm5=data3=(30 31 32 33) + + movq mm7, MMWORD [wk(0)] ; mm7=tmp0L + movq mm6, MMWORD [wk(1)] ; mm6=tmp0H + + movq mm2,mm4 + movq mm3,mm0 + paddd mm4,mm7 ; mm4=data1L + paddd mm0,mm6 ; mm0=data1H + psubd mm2,mm7 ; mm2=data2L + psubd mm3,mm6 ; mm3=data2H + + movq mm7,[GOTOFF(ebx,PD_DESCALE_P1_4)] ; mm7=[PD_DESCALE_P1_4] + + paddd mm4,mm7 + paddd mm0,mm7 + psrad mm4,DESCALE_P1_4 + psrad mm0,DESCALE_P1_4 + paddd mm2,mm7 + paddd mm3,mm7 + psrad mm2,DESCALE_P1_4 + psrad mm3,DESCALE_P1_4 + + packssdw mm4,mm0 ; mm4=data1=(10 11 12 13) + packssdw mm2,mm3 ; mm2=data2=(20 21 22 23) + + movq mm6,mm1 ; transpose coefficients(phase 1) + punpcklwd mm1,mm4 ; mm1=(00 10 01 11) + punpckhwd mm6,mm4 ; mm6=(02 12 03 13) + movq mm7,mm2 ; transpose coefficients(phase 1) + punpcklwd mm2,mm5 ; mm2=(20 30 21 31) + punpckhwd mm7,mm5 ; mm7=(22 32 23 33) + + movq mm0,mm1 ; transpose coefficients(phase 2) + punpckldq mm1,mm2 ; mm1=(00 10 20 30) + punpckhdq mm0,mm2 ; mm0=(01 11 21 31) + movq mm3,mm6 ; transpose coefficients(phase 2) + punpckldq mm6,mm7 ; mm6=(02 12 22 32) + punpckhdq mm3,mm7 ; mm3=(03 13 23 33) + + movq MMWORD [MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm1 + movq MMWORD [MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm0 + movq MMWORD [MMBLOCK(2,0,edi,SIZEOF_JCOEF)], mm6 + movq MMWORD [MMBLOCK(3,0,edi,SIZEOF_JCOEF)], mm3 + +.nextcolumn: + add esi, byte 4*SIZEOF_JCOEF ; coef_block + add edx, byte 4*SIZEOF_ISLOW_MULT_TYPE ; quantptr + add edi, byte 4*DCTSIZE*SIZEOF_JCOEF ; wsptr + dec ecx ; ctr + jnz near .columnloop + + ; ---- Pass 2: process rows from work array, store into output array. + + mov eax, [original_ebp] + lea esi, [workspace] ; JCOEF *wsptr + mov edi, JSAMPARRAY [output_buf(eax)] ; (JSAMPROW *) + mov eax, JDIMENSION [output_col(eax)] + + ; -- Odd part + + movq mm0, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)] + movq mm1, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)] + movq mm2, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)] + movq mm3, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)] + + movq mm4,mm0 + movq mm5,mm0 + punpcklwd mm4,mm1 + punpckhwd mm5,mm1 + movq mm0,mm4 + movq mm1,mm5 + pmaddwd mm4,[GOTOFF(ebx,PW_F256_F089)] ; mm4=(tmp2L) + pmaddwd mm5,[GOTOFF(ebx,PW_F256_F089)] ; mm5=(tmp2H) + pmaddwd mm0,[GOTOFF(ebx,PW_F106_MF217)] ; mm0=(tmp0L) + pmaddwd mm1,[GOTOFF(ebx,PW_F106_MF217)] ; mm1=(tmp0H) + + movq mm6,mm2 + movq mm7,mm2 + punpcklwd mm6,mm3 + punpckhwd mm7,mm3 + movq mm2,mm6 + movq mm3,mm7 + pmaddwd mm6,[GOTOFF(ebx,PW_MF060_MF050)] ; mm6=(tmp2L) + pmaddwd mm7,[GOTOFF(ebx,PW_MF060_MF050)] ; mm7=(tmp2H) + pmaddwd mm2,[GOTOFF(ebx,PW_F145_MF021)] ; mm2=(tmp0L) + pmaddwd mm3,[GOTOFF(ebx,PW_F145_MF021)] ; mm3=(tmp0H) + + paddd mm6,mm4 ; mm6=tmp2L + paddd mm7,mm5 ; mm7=tmp2H + paddd mm2,mm0 ; mm2=tmp0L + paddd mm3,mm1 ; mm3=tmp0H + + movq MMWORD [wk(0)], mm2 ; wk(0)=tmp0L + movq MMWORD [wk(1)], mm3 ; wk(1)=tmp0H + + ; -- Even part + + movq mm4, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)] + movq mm5, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)] + movq mm0, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)] + + pxor mm1,mm1 + pxor mm2,mm2 + punpcklwd mm1,mm4 ; mm1=tmp0L + punpckhwd mm2,mm4 ; mm2=tmp0H + psrad mm1,(16-CONST_BITS-1) ; psrad mm1,16 & pslld mm1,CONST_BITS+1 + psrad mm2,(16-CONST_BITS-1) ; psrad mm2,16 & pslld mm2,CONST_BITS+1 + + movq mm3,mm5 ; mm5=in2=z2 + punpcklwd mm5,mm0 ; mm0=in6=z3 + punpckhwd mm3,mm0 + pmaddwd mm5,[GOTOFF(ebx,PW_F184_MF076)] ; mm5=tmp2L + pmaddwd mm3,[GOTOFF(ebx,PW_F184_MF076)] ; mm3=tmp2H + + movq mm4,mm1 + movq mm0,mm2 + paddd mm1,mm5 ; mm1=tmp10L + paddd mm2,mm3 ; mm2=tmp10H + psubd mm4,mm5 ; mm4=tmp12L + psubd mm0,mm3 ; mm0=tmp12H + + ; -- Final output stage + + movq mm5,mm1 + movq mm3,mm2 + paddd mm1,mm6 ; mm1=data0L + paddd mm2,mm7 ; mm2=data0H + psubd mm5,mm6 ; mm5=data3L + psubd mm3,mm7 ; mm3=data3H + + movq mm6,[GOTOFF(ebx,PD_DESCALE_P2_4)] ; mm6=[PD_DESCALE_P2_4] + + paddd mm1,mm6 + paddd mm2,mm6 + psrad mm1,DESCALE_P2_4 + psrad mm2,DESCALE_P2_4 + paddd mm5,mm6 + paddd mm3,mm6 + psrad mm5,DESCALE_P2_4 + psrad mm3,DESCALE_P2_4 + + packssdw mm1,mm2 ; mm1=data0=(00 10 20 30) + packssdw mm5,mm3 ; mm5=data3=(03 13 23 33) + + movq mm7, MMWORD [wk(0)] ; mm7=tmp0L + movq mm6, MMWORD [wk(1)] ; mm6=tmp0H + + movq mm2,mm4 + movq mm3,mm0 + paddd mm4,mm7 ; mm4=data1L + paddd mm0,mm6 ; mm0=data1H + psubd mm2,mm7 ; mm2=data2L + psubd mm3,mm6 ; mm3=data2H + + movq mm7,[GOTOFF(ebx,PD_DESCALE_P2_4)] ; mm7=[PD_DESCALE_P2_4] + + paddd mm4,mm7 + paddd mm0,mm7 + psrad mm4,DESCALE_P2_4 + psrad mm0,DESCALE_P2_4 + paddd mm2,mm7 + paddd mm3,mm7 + psrad mm2,DESCALE_P2_4 + psrad mm3,DESCALE_P2_4 + + packssdw mm4,mm0 ; mm4=data1=(01 11 21 31) + packssdw mm2,mm3 ; mm2=data2=(02 12 22 32) + + movq mm6,[GOTOFF(ebx,PB_CENTERJSAMP)] ; mm6=[PB_CENTERJSAMP] + + packsswb mm1,mm2 ; mm1=(00 10 20 30 02 12 22 32) + packsswb mm4,mm5 ; mm4=(01 11 21 31 03 13 23 33) + paddb mm1,mm6 + paddb mm4,mm6 + + movq mm7,mm1 ; transpose coefficients(phase 1) + punpcklbw mm1,mm4 ; mm1=(00 01 10 11 20 21 30 31) + punpckhbw mm7,mm4 ; mm7=(02 03 12 13 22 23 32 33) + + movq mm0,mm1 ; transpose coefficients(phase 2) + punpcklwd mm1,mm7 ; mm1=(00 01 02 03 10 11 12 13) + punpckhwd mm0,mm7 ; mm0=(20 21 22 23 30 31 32 33) + + mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW] + mov esi, JSAMPROW [edi+2*SIZEOF_JSAMPROW] + movd DWORD [edx+eax*SIZEOF_JSAMPLE], mm1 + movd DWORD [esi+eax*SIZEOF_JSAMPLE], mm0 + + psrlq mm1,4*BYTE_BIT + psrlq mm0,4*BYTE_BIT + + mov edx, JSAMPROW [edi+1*SIZEOF_JSAMPROW] + mov esi, JSAMPROW [edi+3*SIZEOF_JSAMPROW] + movd DWORD [edx+eax*SIZEOF_JSAMPLE], mm1 + movd DWORD [esi+eax*SIZEOF_JSAMPLE], mm0 + + emms ; empty MMX state + + pop edi + pop esi +; pop edx ; need not be preserved +; pop ecx ; need not be preserved + poppic ebx + mov esp,ebp ; esp <- aligned ebp + pop esp ; esp <- original ebp + pop ebp + ret + + +; -------------------------------------------------------------------------- +; +; Perform dequantization and inverse DCT on one block of coefficients, +; producing a reduced-size 2x2 output block. +; +; GLOBAL(void) +; jsimd_idct_2x2_mmx (void *dct_table, JCOEFPTR coef_block, +; JSAMPARRAY output_buf, JDIMENSION output_col) +; + +%define dct_table(b) (b)+8 ; void *dct_table +%define coef_block(b) (b)+12 ; JCOEFPTR coef_block +%define output_buf(b) (b)+16 ; JSAMPARRAY output_buf +%define output_col(b) (b)+20 ; JDIMENSION output_col + + align 16 + global EXTN(jsimd_idct_2x2_mmx) + +EXTN(jsimd_idct_2x2_mmx): + push ebp + mov ebp,esp + push ebx +; push ecx ; need not be preserved +; push edx ; need not be preserved + push esi + push edi + + get_GOT ebx ; get GOT address + + ; ---- Pass 1: process columns from input. + + mov edx, POINTER [dct_table(ebp)] ; quantptr + mov esi, JCOEFPTR [coef_block(ebp)] ; inptr + + ; | input: | result: | + ; | 00 01 ** 03 ** 05 ** 07 | | + ; | 10 11 ** 13 ** 15 ** 17 | | + ; | ** ** ** ** ** ** ** ** | | + ; | 30 31 ** 33 ** 35 ** 37 | A0 A1 A3 A5 A7 | + ; | ** ** ** ** ** ** ** ** | B0 B1 B3 B5 B7 | + ; | 50 51 ** 53 ** 55 ** 57 | | + ; | ** ** ** ** ** ** ** ** | | + ; | 70 71 ** 73 ** 75 ** 77 | | + + ; -- Odd part + + movq mm0, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)] + movq mm1, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)] + pmullw mm0, MMWORD [MMBLOCK(1,0,edx,SIZEOF_ISLOW_MULT_TYPE)] + pmullw mm1, MMWORD [MMBLOCK(3,0,edx,SIZEOF_ISLOW_MULT_TYPE)] + movq mm2, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)] + movq mm3, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)] + pmullw mm2, MMWORD [MMBLOCK(5,0,edx,SIZEOF_ISLOW_MULT_TYPE)] + pmullw mm3, MMWORD [MMBLOCK(7,0,edx,SIZEOF_ISLOW_MULT_TYPE)] + + ; mm0=(10 11 ** 13), mm1=(30 31 ** 33) + ; mm2=(50 51 ** 53), mm3=(70 71 ** 73) + + pcmpeqd mm7,mm7 + pslld mm7,WORD_BIT ; mm7={0x0000 0xFFFF 0x0000 0xFFFF} + + movq mm4,mm0 ; mm4=(10 11 ** 13) + movq mm5,mm2 ; mm5=(50 51 ** 53) + punpcklwd mm4,mm1 ; mm4=(10 30 11 31) + punpcklwd mm5,mm3 ; mm5=(50 70 51 71) + pmaddwd mm4,[GOTOFF(ebx,PW_F362_MF127)] + pmaddwd mm5,[GOTOFF(ebx,PW_F085_MF072)] + + psrld mm0,WORD_BIT ; mm0=(11 -- 13 --) + pand mm1,mm7 ; mm1=(-- 31 -- 33) + psrld mm2,WORD_BIT ; mm2=(51 -- 53 --) + pand mm3,mm7 ; mm3=(-- 71 -- 73) + por mm0,mm1 ; mm0=(11 31 13 33) + por mm2,mm3 ; mm2=(51 71 53 73) + pmaddwd mm0,[GOTOFF(ebx,PW_F362_MF127)] + pmaddwd mm2,[GOTOFF(ebx,PW_F085_MF072)] + + paddd mm4,mm5 ; mm4=tmp0[col0 col1] + + movq mm6, MMWORD [MMBLOCK(1,1,esi,SIZEOF_JCOEF)] + movq mm1, MMWORD [MMBLOCK(3,1,esi,SIZEOF_JCOEF)] + pmullw mm6, MMWORD [MMBLOCK(1,1,edx,SIZEOF_ISLOW_MULT_TYPE)] + pmullw mm1, MMWORD [MMBLOCK(3,1,edx,SIZEOF_ISLOW_MULT_TYPE)] + movq mm3, MMWORD [MMBLOCK(5,1,esi,SIZEOF_JCOEF)] + movq mm5, MMWORD [MMBLOCK(7,1,esi,SIZEOF_JCOEF)] + pmullw mm3, MMWORD [MMBLOCK(5,1,edx,SIZEOF_ISLOW_MULT_TYPE)] + pmullw mm5, MMWORD [MMBLOCK(7,1,edx,SIZEOF_ISLOW_MULT_TYPE)] + + ; mm6=(** 15 ** 17), mm1=(** 35 ** 37) + ; mm3=(** 55 ** 57), mm5=(** 75 ** 77) + + psrld mm6,WORD_BIT ; mm6=(15 -- 17 --) + pand mm1,mm7 ; mm1=(-- 35 -- 37) + psrld mm3,WORD_BIT ; mm3=(55 -- 57 --) + pand mm5,mm7 ; mm5=(-- 75 -- 77) + por mm6,mm1 ; mm6=(15 35 17 37) + por mm3,mm5 ; mm3=(55 75 57 77) + pmaddwd mm6,[GOTOFF(ebx,PW_F362_MF127)] + pmaddwd mm3,[GOTOFF(ebx,PW_F085_MF072)] + + paddd mm0,mm2 ; mm0=tmp0[col1 col3] + paddd mm6,mm3 ; mm6=tmp0[col5 col7] + + ; -- Even part + + movq mm1, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)] + movq mm5, MMWORD [MMBLOCK(0,1,esi,SIZEOF_JCOEF)] + pmullw mm1, MMWORD [MMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)] + pmullw mm5, MMWORD [MMBLOCK(0,1,edx,SIZEOF_ISLOW_MULT_TYPE)] + + ; mm1=(00 01 ** 03), mm5=(** 05 ** 07) + + movq mm2,mm1 ; mm2=(00 01 ** 03) + pslld mm1,WORD_BIT ; mm1=(-- 00 -- **) + psrad mm1,(WORD_BIT-CONST_BITS-2) ; mm1=tmp10[col0 ****] + + pand mm2,mm7 ; mm2=(-- 01 -- 03) + pand mm5,mm7 ; mm5=(-- 05 -- 07) + psrad mm2,(WORD_BIT-CONST_BITS-2) ; mm2=tmp10[col1 col3] + psrad mm5,(WORD_BIT-CONST_BITS-2) ; mm5=tmp10[col5 col7] + + ; -- Final output stage + + movq mm3,mm1 + paddd mm1,mm4 ; mm1=data0[col0 ****]=(A0 **) + psubd mm3,mm4 ; mm3=data1[col0 ****]=(B0 **) + punpckldq mm1,mm3 ; mm1=(A0 B0) + + movq mm7,[GOTOFF(ebx,PD_DESCALE_P1_2)] ; mm7=[PD_DESCALE_P1_2] + + movq mm4,mm2 + movq mm3,mm5 + paddd mm2,mm0 ; mm2=data0[col1 col3]=(A1 A3) + paddd mm5,mm6 ; mm5=data0[col5 col7]=(A5 A7) + psubd mm4,mm0 ; mm4=data1[col1 col3]=(B1 B3) + psubd mm3,mm6 ; mm3=data1[col5 col7]=(B5 B7) + + paddd mm1,mm7 + psrad mm1,DESCALE_P1_2 + + paddd mm2,mm7 + paddd mm5,mm7 + psrad mm2,DESCALE_P1_2 + psrad mm5,DESCALE_P1_2 + paddd mm4,mm7 + paddd mm3,mm7 + psrad mm4,DESCALE_P1_2 + psrad mm3,DESCALE_P1_2 + + ; ---- Pass 2: process rows, store into output array. + + mov edi, JSAMPARRAY [output_buf(ebp)] ; (JSAMPROW *) + mov eax, JDIMENSION [output_col(ebp)] + + ; | input:| result:| + ; | A0 B0 | | + ; | A1 B1 | C0 C1 | + ; | A3 B3 | D0 D1 | + ; | A5 B5 | | + ; | A7 B7 | | + + ; -- Odd part + + packssdw mm2,mm4 ; mm2=(A1 A3 B1 B3) + packssdw mm5,mm3 ; mm5=(A5 A7 B5 B7) + pmaddwd mm2,[GOTOFF(ebx,PW_F362_MF127)] + pmaddwd mm5,[GOTOFF(ebx,PW_F085_MF072)] + + paddd mm2,mm5 ; mm2=tmp0[row0 row1] + + ; -- Even part + + pslld mm1,(CONST_BITS+2) ; mm1=tmp10[row0 row1] + + ; -- Final output stage + + movq mm0,[GOTOFF(ebx,PD_DESCALE_P2_2)] ; mm0=[PD_DESCALE_P2_2] + + movq mm6,mm1 + paddd mm1,mm2 ; mm1=data0[row0 row1]=(C0 C1) + psubd mm6,mm2 ; mm6=data1[row0 row1]=(D0 D1) + + paddd mm1,mm0 + paddd mm6,mm0 + psrad mm1,DESCALE_P2_2 + psrad mm6,DESCALE_P2_2 + + movq mm7,mm1 ; transpose coefficients + punpckldq mm1,mm6 ; mm1=(C0 D0) + punpckhdq mm7,mm6 ; mm7=(C1 D1) + + packssdw mm1,mm7 ; mm1=(C0 D0 C1 D1) + packsswb mm1,mm1 ; mm1=(C0 D0 C1 D1 C0 D0 C1 D1) + paddb mm1,[GOTOFF(ebx,PB_CENTERJSAMP)] + + movd ecx,mm1 + movd ebx,mm1 ; ebx=(C0 D0 C1 D1) + shr ecx,2*BYTE_BIT ; ecx=(C1 D1 -- --) + + mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW] + mov esi, JSAMPROW [edi+1*SIZEOF_JSAMPROW] + mov WORD [edx+eax*SIZEOF_JSAMPLE], bx + mov WORD [esi+eax*SIZEOF_JSAMPLE], cx + + emms ; empty MMX state + + pop edi + pop esi +; pop edx ; need not be preserved +; pop ecx ; need not be preserved + pop ebx + pop ebp + ret + +; For some reason, the OS X linker does not honor the request to align the +; segment unless we do this. + align 16 diff --git a/media/libjpeg/simd/jidctred-sse2-64.asm b/media/libjpeg/simd/jidctred-sse2-64.asm new file mode 100644 index 000000000..a54bbe24e --- /dev/null +++ b/media/libjpeg/simd/jidctred-sse2-64.asm @@ -0,0 +1,575 @@ +; +; jidctred.asm - reduced-size IDCT (64-bit SSE2) +; +; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB +; Copyright (C) 2009, D. R. Commander. +; +; Based on the x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler). +; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 +; +; This file contains inverse-DCT routines that produce reduced-size +; output: either 4x4 or 2x2 pixels from an 8x8 DCT block. +; The following code is based directly on the IJG's original jidctred.c; +; see the jidctred.c for more details. +; +; [TAB8] + +%include "jsimdext.inc" +%include "jdct.inc" + +; -------------------------------------------------------------------------- + +%define CONST_BITS 13 +%define PASS1_BITS 2 + +%define DESCALE_P1_4 (CONST_BITS-PASS1_BITS+1) +%define DESCALE_P2_4 (CONST_BITS+PASS1_BITS+3+1) +%define DESCALE_P1_2 (CONST_BITS-PASS1_BITS+2) +%define DESCALE_P2_2 (CONST_BITS+PASS1_BITS+3+2) + +%if CONST_BITS == 13 +F_0_211 equ 1730 ; FIX(0.211164243) +F_0_509 equ 4176 ; FIX(0.509795579) +F_0_601 equ 4926 ; FIX(0.601344887) +F_0_720 equ 5906 ; FIX(0.720959822) +F_0_765 equ 6270 ; FIX(0.765366865) +F_0_850 equ 6967 ; FIX(0.850430095) +F_0_899 equ 7373 ; FIX(0.899976223) +F_1_061 equ 8697 ; FIX(1.061594337) +F_1_272 equ 10426 ; FIX(1.272758580) +F_1_451 equ 11893 ; FIX(1.451774981) +F_1_847 equ 15137 ; FIX(1.847759065) +F_2_172 equ 17799 ; FIX(2.172734803) +F_2_562 equ 20995 ; FIX(2.562915447) +F_3_624 equ 29692 ; FIX(3.624509785) +%else +; NASM cannot do compile-time arithmetic on floating-point constants. +%define DESCALE(x,n) (((x)+(1<<((n)-1)))>>(n)) +F_0_211 equ DESCALE( 226735879,30-CONST_BITS) ; FIX(0.211164243) +F_0_509 equ DESCALE( 547388834,30-CONST_BITS) ; FIX(0.509795579) +F_0_601 equ DESCALE( 645689155,30-CONST_BITS) ; FIX(0.601344887) +F_0_720 equ DESCALE( 774124714,30-CONST_BITS) ; FIX(0.720959822) +F_0_765 equ DESCALE( 821806413,30-CONST_BITS) ; FIX(0.765366865) +F_0_850 equ DESCALE( 913142361,30-CONST_BITS) ; FIX(0.850430095) +F_0_899 equ DESCALE( 966342111,30-CONST_BITS) ; FIX(0.899976223) +F_1_061 equ DESCALE(1139878239,30-CONST_BITS) ; FIX(1.061594337) +F_1_272 equ DESCALE(1366614119,30-CONST_BITS) ; FIX(1.272758580) +F_1_451 equ DESCALE(1558831516,30-CONST_BITS) ; FIX(1.451774981) +F_1_847 equ DESCALE(1984016188,30-CONST_BITS) ; FIX(1.847759065) +F_2_172 equ DESCALE(2332956230,30-CONST_BITS) ; FIX(2.172734803) +F_2_562 equ DESCALE(2751909506,30-CONST_BITS) ; FIX(2.562915447) +F_3_624 equ DESCALE(3891787747,30-CONST_BITS) ; FIX(3.624509785) +%endif + +; -------------------------------------------------------------------------- + SECTION SEG_CONST + + alignz 16 + global EXTN(jconst_idct_red_sse2) + +EXTN(jconst_idct_red_sse2): + +PW_F184_MF076 times 4 dw F_1_847,-F_0_765 +PW_F256_F089 times 4 dw F_2_562, F_0_899 +PW_F106_MF217 times 4 dw F_1_061,-F_2_172 +PW_MF060_MF050 times 4 dw -F_0_601,-F_0_509 +PW_F145_MF021 times 4 dw F_1_451,-F_0_211 +PW_F362_MF127 times 4 dw F_3_624,-F_1_272 +PW_F085_MF072 times 4 dw F_0_850,-F_0_720 +PD_DESCALE_P1_4 times 4 dd 1 << (DESCALE_P1_4-1) +PD_DESCALE_P2_4 times 4 dd 1 << (DESCALE_P2_4-1) +PD_DESCALE_P1_2 times 4 dd 1 << (DESCALE_P1_2-1) +PD_DESCALE_P2_2 times 4 dd 1 << (DESCALE_P2_2-1) +PB_CENTERJSAMP times 16 db CENTERJSAMPLE + + alignz 16 + +; -------------------------------------------------------------------------- + SECTION SEG_TEXT + BITS 64 +; +; Perform dequantization and inverse DCT on one block of coefficients, +; producing a reduced-size 4x4 output block. +; +; GLOBAL(void) +; jsimd_idct_4x4_sse2 (void *dct_table, JCOEFPTR coef_block, +; JSAMPARRAY output_buf, JDIMENSION output_col) +; + +; r10 = void *dct_table +; r11 = JCOEFPTR coef_block +; r12 = JSAMPARRAY output_buf +; r13 = JDIMENSION output_col + +%define original_rbp rbp+0 +%define wk(i) rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM] +%define WK_NUM 2 + + align 16 + global EXTN(jsimd_idct_4x4_sse2) + +EXTN(jsimd_idct_4x4_sse2): + push rbp + mov rax,rsp ; rax = original rbp + sub rsp, byte 4 + and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits + mov [rsp],rax + mov rbp,rsp ; rbp = aligned rbp + lea rsp, [wk(0)] + collect_args + + ; ---- Pass 1: process columns from input. + + mov rdx, r10 ; quantptr + mov rsi, r11 ; inptr + +%ifndef NO_ZERO_COLUMN_TEST_4X4_SSE2 + mov eax, DWORD [DWBLOCK(1,0,rsi,SIZEOF_JCOEF)] + or eax, DWORD [DWBLOCK(2,0,rsi,SIZEOF_JCOEF)] + jnz short .columnDCT + + movdqa xmm0, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_JCOEF)] + movdqa xmm1, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_JCOEF)] + por xmm0, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_JCOEF)] + por xmm1, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_JCOEF)] + por xmm0, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_JCOEF)] + por xmm1, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_JCOEF)] + por xmm0,xmm1 + packsswb xmm0,xmm0 + packsswb xmm0,xmm0 + movd eax,xmm0 + test rax,rax + jnz short .columnDCT + + ; -- AC terms all zero + + movdqa xmm0, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_JCOEF)] + pmullw xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_ISLOW_MULT_TYPE)] + + psllw xmm0,PASS1_BITS + + movdqa xmm3,xmm0 ; xmm0=in0=(00 01 02 03 04 05 06 07) + punpcklwd xmm0,xmm0 ; xmm0=(00 00 01 01 02 02 03 03) + punpckhwd xmm3,xmm3 ; xmm3=(04 04 05 05 06 06 07 07) + + pshufd xmm1,xmm0,0x50 ; xmm1=[col0 col1]=(00 00 00 00 01 01 01 01) + pshufd xmm0,xmm0,0xFA ; xmm0=[col2 col3]=(02 02 02 02 03 03 03 03) + pshufd xmm6,xmm3,0x50 ; xmm6=[col4 col5]=(04 04 04 04 05 05 05 05) + pshufd xmm3,xmm3,0xFA ; xmm3=[col6 col7]=(06 06 06 06 07 07 07 07) + + jmp near .column_end +%endif +.columnDCT: + + ; -- Odd part + + movdqa xmm0, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_JCOEF)] + movdqa xmm1, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_JCOEF)] + pmullw xmm0, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_ISLOW_MULT_TYPE)] + pmullw xmm1, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_ISLOW_MULT_TYPE)] + movdqa xmm2, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_JCOEF)] + movdqa xmm3, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_JCOEF)] + pmullw xmm2, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_ISLOW_MULT_TYPE)] + pmullw xmm3, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_ISLOW_MULT_TYPE)] + + movdqa xmm4,xmm0 + movdqa xmm5,xmm0 + punpcklwd xmm4,xmm1 + punpckhwd xmm5,xmm1 + movdqa xmm0,xmm4 + movdqa xmm1,xmm5 + pmaddwd xmm4,[rel PW_F256_F089] ; xmm4=(tmp2L) + pmaddwd xmm5,[rel PW_F256_F089] ; xmm5=(tmp2H) + pmaddwd xmm0,[rel PW_F106_MF217] ; xmm0=(tmp0L) + pmaddwd xmm1,[rel PW_F106_MF217] ; xmm1=(tmp0H) + + movdqa xmm6,xmm2 + movdqa xmm7,xmm2 + punpcklwd xmm6,xmm3 + punpckhwd xmm7,xmm3 + movdqa xmm2,xmm6 + movdqa xmm3,xmm7 + pmaddwd xmm6,[rel PW_MF060_MF050] ; xmm6=(tmp2L) + pmaddwd xmm7,[rel PW_MF060_MF050] ; xmm7=(tmp2H) + pmaddwd xmm2,[rel PW_F145_MF021] ; xmm2=(tmp0L) + pmaddwd xmm3,[rel PW_F145_MF021] ; xmm3=(tmp0H) + + paddd xmm6,xmm4 ; xmm6=tmp2L + paddd xmm7,xmm5 ; xmm7=tmp2H + paddd xmm2,xmm0 ; xmm2=tmp0L + paddd xmm3,xmm1 ; xmm3=tmp0H + + movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=tmp0L + movdqa XMMWORD [wk(1)], xmm3 ; wk(1)=tmp0H + + ; -- Even part + + movdqa xmm4, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_JCOEF)] + movdqa xmm5, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_JCOEF)] + movdqa xmm0, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_JCOEF)] + pmullw xmm4, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_ISLOW_MULT_TYPE)] + pmullw xmm5, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_ISLOW_MULT_TYPE)] + pmullw xmm0, XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_ISLOW_MULT_TYPE)] + + pxor xmm1,xmm1 + pxor xmm2,xmm2 + punpcklwd xmm1,xmm4 ; xmm1=tmp0L + punpckhwd xmm2,xmm4 ; xmm2=tmp0H + psrad xmm1,(16-CONST_BITS-1) ; psrad xmm1,16 & pslld xmm1,CONST_BITS+1 + psrad xmm2,(16-CONST_BITS-1) ; psrad xmm2,16 & pslld xmm2,CONST_BITS+1 + + movdqa xmm3,xmm5 ; xmm5=in2=z2 + punpcklwd xmm5,xmm0 ; xmm0=in6=z3 + punpckhwd xmm3,xmm0 + pmaddwd xmm5,[rel PW_F184_MF076] ; xmm5=tmp2L + pmaddwd xmm3,[rel PW_F184_MF076] ; xmm3=tmp2H + + movdqa xmm4,xmm1 + movdqa xmm0,xmm2 + paddd xmm1,xmm5 ; xmm1=tmp10L + paddd xmm2,xmm3 ; xmm2=tmp10H + psubd xmm4,xmm5 ; xmm4=tmp12L + psubd xmm0,xmm3 ; xmm0=tmp12H + + ; -- Final output stage + + movdqa xmm5,xmm1 + movdqa xmm3,xmm2 + paddd xmm1,xmm6 ; xmm1=data0L + paddd xmm2,xmm7 ; xmm2=data0H + psubd xmm5,xmm6 ; xmm5=data3L + psubd xmm3,xmm7 ; xmm3=data3H + + movdqa xmm6,[rel PD_DESCALE_P1_4] ; xmm6=[rel PD_DESCALE_P1_4] + + paddd xmm1,xmm6 + paddd xmm2,xmm6 + psrad xmm1,DESCALE_P1_4 + psrad xmm2,DESCALE_P1_4 + paddd xmm5,xmm6 + paddd xmm3,xmm6 + psrad xmm5,DESCALE_P1_4 + psrad xmm3,DESCALE_P1_4 + + packssdw xmm1,xmm2 ; xmm1=data0=(00 01 02 03 04 05 06 07) + packssdw xmm5,xmm3 ; xmm5=data3=(30 31 32 33 34 35 36 37) + + movdqa xmm7, XMMWORD [wk(0)] ; xmm7=tmp0L + movdqa xmm6, XMMWORD [wk(1)] ; xmm6=tmp0H + + movdqa xmm2,xmm4 + movdqa xmm3,xmm0 + paddd xmm4,xmm7 ; xmm4=data1L + paddd xmm0,xmm6 ; xmm0=data1H + psubd xmm2,xmm7 ; xmm2=data2L + psubd xmm3,xmm6 ; xmm3=data2H + + movdqa xmm7,[rel PD_DESCALE_P1_4] ; xmm7=[rel PD_DESCALE_P1_4] + + paddd xmm4,xmm7 + paddd xmm0,xmm7 + psrad xmm4,DESCALE_P1_4 + psrad xmm0,DESCALE_P1_4 + paddd xmm2,xmm7 + paddd xmm3,xmm7 + psrad xmm2,DESCALE_P1_4 + psrad xmm3,DESCALE_P1_4 + + packssdw xmm4,xmm0 ; xmm4=data1=(10 11 12 13 14 15 16 17) + packssdw xmm2,xmm3 ; xmm2=data2=(20 21 22 23 24 25 26 27) + + movdqa xmm6,xmm1 ; transpose coefficients(phase 1) + punpcklwd xmm1,xmm4 ; xmm1=(00 10 01 11 02 12 03 13) + punpckhwd xmm6,xmm4 ; xmm6=(04 14 05 15 06 16 07 17) + movdqa xmm7,xmm2 ; transpose coefficients(phase 1) + punpcklwd xmm2,xmm5 ; xmm2=(20 30 21 31 22 32 23 33) + punpckhwd xmm7,xmm5 ; xmm7=(24 34 25 35 26 36 27 37) + + movdqa xmm0,xmm1 ; transpose coefficients(phase 2) + punpckldq xmm1,xmm2 ; xmm1=[col0 col1]=(00 10 20 30 01 11 21 31) + punpckhdq xmm0,xmm2 ; xmm0=[col2 col3]=(02 12 22 32 03 13 23 33) + movdqa xmm3,xmm6 ; transpose coefficients(phase 2) + punpckldq xmm6,xmm7 ; xmm6=[col4 col5]=(04 14 24 34 05 15 25 35) + punpckhdq xmm3,xmm7 ; xmm3=[col6 col7]=(06 16 26 36 07 17 27 37) +.column_end: + + ; -- Prefetch the next coefficient block + + prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 0*32] + prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 1*32] + prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 2*32] + prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 3*32] + + ; ---- Pass 2: process rows, store into output array. + + mov rax, [original_rbp] + mov rdi, r12 ; (JSAMPROW *) + mov eax, r13d + + ; -- Even part + + pxor xmm4,xmm4 + punpcklwd xmm4,xmm1 ; xmm4=tmp0 + psrad xmm4,(16-CONST_BITS-1) ; psrad xmm4,16 & pslld xmm4,CONST_BITS+1 + + ; -- Odd part + + punpckhwd xmm1,xmm0 + punpckhwd xmm6,xmm3 + movdqa xmm5,xmm1 + movdqa xmm2,xmm6 + pmaddwd xmm1,[rel PW_F256_F089] ; xmm1=(tmp2) + pmaddwd xmm6,[rel PW_MF060_MF050] ; xmm6=(tmp2) + pmaddwd xmm5,[rel PW_F106_MF217] ; xmm5=(tmp0) + pmaddwd xmm2,[rel PW_F145_MF021] ; xmm2=(tmp0) + + paddd xmm6,xmm1 ; xmm6=tmp2 + paddd xmm2,xmm5 ; xmm2=tmp0 + + ; -- Even part + + punpcklwd xmm0,xmm3 + pmaddwd xmm0,[rel PW_F184_MF076] ; xmm0=tmp2 + + movdqa xmm7,xmm4 + paddd xmm4,xmm0 ; xmm4=tmp10 + psubd xmm7,xmm0 ; xmm7=tmp12 + + ; -- Final output stage + + movdqa xmm1,[rel PD_DESCALE_P2_4] ; xmm1=[rel PD_DESCALE_P2_4] + + movdqa xmm5,xmm4 + movdqa xmm3,xmm7 + paddd xmm4,xmm6 ; xmm4=data0=(00 10 20 30) + paddd xmm7,xmm2 ; xmm7=data1=(01 11 21 31) + psubd xmm5,xmm6 ; xmm5=data3=(03 13 23 33) + psubd xmm3,xmm2 ; xmm3=data2=(02 12 22 32) + + paddd xmm4,xmm1 + paddd xmm7,xmm1 + psrad xmm4,DESCALE_P2_4 + psrad xmm7,DESCALE_P2_4 + paddd xmm5,xmm1 + paddd xmm3,xmm1 + psrad xmm5,DESCALE_P2_4 + psrad xmm3,DESCALE_P2_4 + + packssdw xmm4,xmm3 ; xmm4=(00 10 20 30 02 12 22 32) + packssdw xmm7,xmm5 ; xmm7=(01 11 21 31 03 13 23 33) + + movdqa xmm0,xmm4 ; transpose coefficients(phase 1) + punpcklwd xmm4,xmm7 ; xmm4=(00 01 10 11 20 21 30 31) + punpckhwd xmm0,xmm7 ; xmm0=(02 03 12 13 22 23 32 33) + + movdqa xmm6,xmm4 ; transpose coefficients(phase 2) + punpckldq xmm4,xmm0 ; xmm4=(00 01 02 03 10 11 12 13) + punpckhdq xmm6,xmm0 ; xmm6=(20 21 22 23 30 31 32 33) + + packsswb xmm4,xmm6 ; xmm4=(00 01 02 03 10 11 12 13 20 ..) + paddb xmm4,[rel PB_CENTERJSAMP] + + pshufd xmm2,xmm4,0x39 ; xmm2=(10 11 12 13 20 21 22 23 30 ..) + pshufd xmm1,xmm4,0x4E ; xmm1=(20 21 22 23 30 31 32 33 00 ..) + pshufd xmm3,xmm4,0x93 ; xmm3=(30 31 32 33 00 01 02 03 10 ..) + + mov rdx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW] + mov rsi, JSAMPROW [rdi+1*SIZEOF_JSAMPROW] + movd XMM_DWORD [rdx+rax*SIZEOF_JSAMPLE], xmm4 + movd XMM_DWORD [rsi+rax*SIZEOF_JSAMPLE], xmm2 + mov rdx, JSAMPROW [rdi+2*SIZEOF_JSAMPROW] + mov rsi, JSAMPROW [rdi+3*SIZEOF_JSAMPROW] + movd XMM_DWORD [rdx+rax*SIZEOF_JSAMPLE], xmm1 + movd XMM_DWORD [rsi+rax*SIZEOF_JSAMPLE], xmm3 + + uncollect_args + mov rsp,rbp ; rsp <- aligned rbp + pop rsp ; rsp <- original rbp + pop rbp + ret + + +; -------------------------------------------------------------------------- +; +; Perform dequantization and inverse DCT on one block of coefficients, +; producing a reduced-size 2x2 output block. +; +; GLOBAL(void) +; jsimd_idct_2x2_sse2 (void *dct_table, JCOEFPTR coef_block, +; JSAMPARRAY output_buf, JDIMENSION output_col) +; + +; r10 = void *dct_table +; r11 = JCOEFPTR coef_block +; r12 = JSAMPARRAY output_buf +; r13 = JDIMENSION output_col + + align 16 + global EXTN(jsimd_idct_2x2_sse2) + +EXTN(jsimd_idct_2x2_sse2): + push rbp + mov rax,rsp + mov rbp,rsp + collect_args + push rbx + + ; ---- Pass 1: process columns from input. + + mov rdx, r10 ; quantptr + mov rsi, r11 ; inptr + + ; | input: | result: | + ; | 00 01 ** 03 ** 05 ** 07 | | + ; | 10 11 ** 13 ** 15 ** 17 | | + ; | ** ** ** ** ** ** ** ** | | + ; | 30 31 ** 33 ** 35 ** 37 | A0 A1 A3 A5 A7 | + ; | ** ** ** ** ** ** ** ** | B0 B1 B3 B5 B7 | + ; | 50 51 ** 53 ** 55 ** 57 | | + ; | ** ** ** ** ** ** ** ** | | + ; | 70 71 ** 73 ** 75 ** 77 | | + + ; -- Odd part + + movdqa xmm0, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_JCOEF)] + movdqa xmm1, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_JCOEF)] + pmullw xmm0, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_ISLOW_MULT_TYPE)] + pmullw xmm1, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_ISLOW_MULT_TYPE)] + movdqa xmm2, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_JCOEF)] + movdqa xmm3, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_JCOEF)] + pmullw xmm2, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_ISLOW_MULT_TYPE)] + pmullw xmm3, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_ISLOW_MULT_TYPE)] + + ; xmm0=(10 11 ** 13 ** 15 ** 17), xmm1=(30 31 ** 33 ** 35 ** 37) + ; xmm2=(50 51 ** 53 ** 55 ** 57), xmm3=(70 71 ** 73 ** 75 ** 77) + + pcmpeqd xmm7,xmm7 + pslld xmm7,WORD_BIT ; xmm7={0x0000 0xFFFF 0x0000 0xFFFF ..} + + movdqa xmm4,xmm0 ; xmm4=(10 11 ** 13 ** 15 ** 17) + movdqa xmm5,xmm2 ; xmm5=(50 51 ** 53 ** 55 ** 57) + punpcklwd xmm4,xmm1 ; xmm4=(10 30 11 31 ** ** 13 33) + punpcklwd xmm5,xmm3 ; xmm5=(50 70 51 71 ** ** 53 73) + pmaddwd xmm4,[rel PW_F362_MF127] + pmaddwd xmm5,[rel PW_F085_MF072] + + psrld xmm0,WORD_BIT ; xmm0=(11 -- 13 -- 15 -- 17 --) + pand xmm1,xmm7 ; xmm1=(-- 31 -- 33 -- 35 -- 37) + psrld xmm2,WORD_BIT ; xmm2=(51 -- 53 -- 55 -- 57 --) + pand xmm3,xmm7 ; xmm3=(-- 71 -- 73 -- 75 -- 77) + por xmm0,xmm1 ; xmm0=(11 31 13 33 15 35 17 37) + por xmm2,xmm3 ; xmm2=(51 71 53 73 55 75 57 77) + pmaddwd xmm0,[rel PW_F362_MF127] + pmaddwd xmm2,[rel PW_F085_MF072] + + paddd xmm4,xmm5 ; xmm4=tmp0[col0 col1 **** col3] + paddd xmm0,xmm2 ; xmm0=tmp0[col1 col3 col5 col7] + + ; -- Even part + + movdqa xmm6, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_JCOEF)] + pmullw xmm6, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_ISLOW_MULT_TYPE)] + + ; xmm6=(00 01 ** 03 ** 05 ** 07) + + movdqa xmm1,xmm6 ; xmm1=(00 01 ** 03 ** 05 ** 07) + pslld xmm6,WORD_BIT ; xmm6=(-- 00 -- ** -- ** -- **) + pand xmm1,xmm7 ; xmm1=(-- 01 -- 03 -- 05 -- 07) + psrad xmm6,(WORD_BIT-CONST_BITS-2) ; xmm6=tmp10[col0 **** **** ****] + psrad xmm1,(WORD_BIT-CONST_BITS-2) ; xmm1=tmp10[col1 col3 col5 col7] + + ; -- Final output stage + + movdqa xmm3,xmm6 + movdqa xmm5,xmm1 + paddd xmm6,xmm4 ; xmm6=data0[col0 **** **** ****]=(A0 ** ** **) + paddd xmm1,xmm0 ; xmm1=data0[col1 col3 col5 col7]=(A1 A3 A5 A7) + psubd xmm3,xmm4 ; xmm3=data1[col0 **** **** ****]=(B0 ** ** **) + psubd xmm5,xmm0 ; xmm5=data1[col1 col3 col5 col7]=(B1 B3 B5 B7) + + movdqa xmm2,[rel PD_DESCALE_P1_2] ; xmm2=[rel PD_DESCALE_P1_2] + + punpckldq xmm6,xmm3 ; xmm6=(A0 B0 ** **) + + movdqa xmm7,xmm1 + punpcklqdq xmm1,xmm5 ; xmm1=(A1 A3 B1 B3) + punpckhqdq xmm7,xmm5 ; xmm7=(A5 A7 B5 B7) + + paddd xmm6,xmm2 + psrad xmm6,DESCALE_P1_2 + + paddd xmm1,xmm2 + paddd xmm7,xmm2 + psrad xmm1,DESCALE_P1_2 + psrad xmm7,DESCALE_P1_2 + + ; -- Prefetch the next coefficient block + + prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 0*32] + prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 1*32] + prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 2*32] + prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 3*32] + + ; ---- Pass 2: process rows, store into output array. + + mov rdi, r12 ; (JSAMPROW *) + mov eax, r13d + + ; | input:| result:| + ; | A0 B0 | | + ; | A1 B1 | C0 C1 | + ; | A3 B3 | D0 D1 | + ; | A5 B5 | | + ; | A7 B7 | | + + ; -- Odd part + + packssdw xmm1,xmm1 ; xmm1=(A1 A3 B1 B3 A1 A3 B1 B3) + packssdw xmm7,xmm7 ; xmm7=(A5 A7 B5 B7 A5 A7 B5 B7) + pmaddwd xmm1,[rel PW_F362_MF127] + pmaddwd xmm7,[rel PW_F085_MF072] + + paddd xmm1,xmm7 ; xmm1=tmp0[row0 row1 row0 row1] + + ; -- Even part + + pslld xmm6,(CONST_BITS+2) ; xmm6=tmp10[row0 row1 **** ****] + + ; -- Final output stage + + movdqa xmm4,xmm6 + paddd xmm6,xmm1 ; xmm6=data0[row0 row1 **** ****]=(C0 C1 ** **) + psubd xmm4,xmm1 ; xmm4=data1[row0 row1 **** ****]=(D0 D1 ** **) + + punpckldq xmm6,xmm4 ; xmm6=(C0 D0 C1 D1) + + paddd xmm6,[rel PD_DESCALE_P2_2] + psrad xmm6,DESCALE_P2_2 + + packssdw xmm6,xmm6 ; xmm6=(C0 D0 C1 D1 C0 D0 C1 D1) + packsswb xmm6,xmm6 ; xmm6=(C0 D0 C1 D1 C0 D0 C1 D1 ..) + paddb xmm6,[rel PB_CENTERJSAMP] + + pextrw ebx,xmm6,0x00 ; ebx=(C0 D0 -- --) + pextrw ecx,xmm6,0x01 ; ecx=(C1 D1 -- --) + + mov rdx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW] + mov rsi, JSAMPROW [rdi+1*SIZEOF_JSAMPROW] + mov WORD [rdx+rax*SIZEOF_JSAMPLE], bx + mov WORD [rsi+rax*SIZEOF_JSAMPLE], cx + + pop rbx + uncollect_args + pop rbp + ret + +; For some reason, the OS X linker does not honor the request to align the +; segment unless we do this. + align 16 diff --git a/media/libjpeg/simd/jidctred-sse2.asm b/media/libjpeg/simd/jidctred-sse2.asm new file mode 100644 index 000000000..232d9838d --- /dev/null +++ b/media/libjpeg/simd/jidctred-sse2.asm @@ -0,0 +1,593 @@ +; +; jidctred.asm - reduced-size IDCT (SSE2) +; +; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB +; +; Based on the x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler). +; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 +; +; This file contains inverse-DCT routines that produce reduced-size +; output: either 4x4 or 2x2 pixels from an 8x8 DCT block. +; The following code is based directly on the IJG's original jidctred.c; +; see the jidctred.c for more details. +; +; [TAB8] + +%include "jsimdext.inc" +%include "jdct.inc" + +; -------------------------------------------------------------------------- + +%define CONST_BITS 13 +%define PASS1_BITS 2 + +%define DESCALE_P1_4 (CONST_BITS-PASS1_BITS+1) +%define DESCALE_P2_4 (CONST_BITS+PASS1_BITS+3+1) +%define DESCALE_P1_2 (CONST_BITS-PASS1_BITS+2) +%define DESCALE_P2_2 (CONST_BITS+PASS1_BITS+3+2) + +%if CONST_BITS == 13 +F_0_211 equ 1730 ; FIX(0.211164243) +F_0_509 equ 4176 ; FIX(0.509795579) +F_0_601 equ 4926 ; FIX(0.601344887) +F_0_720 equ 5906 ; FIX(0.720959822) +F_0_765 equ 6270 ; FIX(0.765366865) +F_0_850 equ 6967 ; FIX(0.850430095) +F_0_899 equ 7373 ; FIX(0.899976223) +F_1_061 equ 8697 ; FIX(1.061594337) +F_1_272 equ 10426 ; FIX(1.272758580) +F_1_451 equ 11893 ; FIX(1.451774981) +F_1_847 equ 15137 ; FIX(1.847759065) +F_2_172 equ 17799 ; FIX(2.172734803) +F_2_562 equ 20995 ; FIX(2.562915447) +F_3_624 equ 29692 ; FIX(3.624509785) +%else +; NASM cannot do compile-time arithmetic on floating-point constants. +%define DESCALE(x,n) (((x)+(1<<((n)-1)))>>(n)) +F_0_211 equ DESCALE( 226735879,30-CONST_BITS) ; FIX(0.211164243) +F_0_509 equ DESCALE( 547388834,30-CONST_BITS) ; FIX(0.509795579) +F_0_601 equ DESCALE( 645689155,30-CONST_BITS) ; FIX(0.601344887) +F_0_720 equ DESCALE( 774124714,30-CONST_BITS) ; FIX(0.720959822) +F_0_765 equ DESCALE( 821806413,30-CONST_BITS) ; FIX(0.765366865) +F_0_850 equ DESCALE( 913142361,30-CONST_BITS) ; FIX(0.850430095) +F_0_899 equ DESCALE( 966342111,30-CONST_BITS) ; FIX(0.899976223) +F_1_061 equ DESCALE(1139878239,30-CONST_BITS) ; FIX(1.061594337) +F_1_272 equ DESCALE(1366614119,30-CONST_BITS) ; FIX(1.272758580) +F_1_451 equ DESCALE(1558831516,30-CONST_BITS) ; FIX(1.451774981) +F_1_847 equ DESCALE(1984016188,30-CONST_BITS) ; FIX(1.847759065) +F_2_172 equ DESCALE(2332956230,30-CONST_BITS) ; FIX(2.172734803) +F_2_562 equ DESCALE(2751909506,30-CONST_BITS) ; FIX(2.562915447) +F_3_624 equ DESCALE(3891787747,30-CONST_BITS) ; FIX(3.624509785) +%endif + +; -------------------------------------------------------------------------- + SECTION SEG_CONST + + alignz 16 + global EXTN(jconst_idct_red_sse2) + +EXTN(jconst_idct_red_sse2): + +PW_F184_MF076 times 4 dw F_1_847,-F_0_765 +PW_F256_F089 times 4 dw F_2_562, F_0_899 +PW_F106_MF217 times 4 dw F_1_061,-F_2_172 +PW_MF060_MF050 times 4 dw -F_0_601,-F_0_509 +PW_F145_MF021 times 4 dw F_1_451,-F_0_211 +PW_F362_MF127 times 4 dw F_3_624,-F_1_272 +PW_F085_MF072 times 4 dw F_0_850,-F_0_720 +PD_DESCALE_P1_4 times 4 dd 1 << (DESCALE_P1_4-1) +PD_DESCALE_P2_4 times 4 dd 1 << (DESCALE_P2_4-1) +PD_DESCALE_P1_2 times 4 dd 1 << (DESCALE_P1_2-1) +PD_DESCALE_P2_2 times 4 dd 1 << (DESCALE_P2_2-1) +PB_CENTERJSAMP times 16 db CENTERJSAMPLE + + alignz 16 + +; -------------------------------------------------------------------------- + SECTION SEG_TEXT + BITS 32 +; +; Perform dequantization and inverse DCT on one block of coefficients, +; producing a reduced-size 4x4 output block. +; +; GLOBAL(void) +; jsimd_idct_4x4_sse2 (void *dct_table, JCOEFPTR coef_block, +; JSAMPARRAY output_buf, JDIMENSION output_col) +; + +%define dct_table(b) (b)+8 ; void *dct_table +%define coef_block(b) (b)+12 ; JCOEFPTR coef_block +%define output_buf(b) (b)+16 ; JSAMPARRAY output_buf +%define output_col(b) (b)+20 ; JDIMENSION output_col + +%define original_ebp ebp+0 +%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM] +%define WK_NUM 2 + + align 16 + global EXTN(jsimd_idct_4x4_sse2) + +EXTN(jsimd_idct_4x4_sse2): + push ebp + mov eax,esp ; eax = original ebp + sub esp, byte 4 + and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits + mov [esp],eax + mov ebp,esp ; ebp = aligned ebp + lea esp, [wk(0)] + pushpic ebx +; push ecx ; unused +; push edx ; need not be preserved + push esi + push edi + + get_GOT ebx ; get GOT address + + ; ---- Pass 1: process columns from input. + +; mov eax, [original_ebp] + mov edx, POINTER [dct_table(eax)] ; quantptr + mov esi, JCOEFPTR [coef_block(eax)] ; inptr + +%ifndef NO_ZERO_COLUMN_TEST_4X4_SSE2 + mov eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)] + or eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)] + jnz short .columnDCT + + movdqa xmm0, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)] + movdqa xmm1, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_JCOEF)] + por xmm0, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_JCOEF)] + por xmm1, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_JCOEF)] + por xmm0, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_JCOEF)] + por xmm1, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_JCOEF)] + por xmm0,xmm1 + packsswb xmm0,xmm0 + packsswb xmm0,xmm0 + movd eax,xmm0 + test eax,eax + jnz short .columnDCT + + ; -- AC terms all zero + + movdqa xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_JCOEF)] + pmullw xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)] + + psllw xmm0,PASS1_BITS + + movdqa xmm3,xmm0 ; xmm0=in0=(00 01 02 03 04 05 06 07) + punpcklwd xmm0,xmm0 ; xmm0=(00 00 01 01 02 02 03 03) + punpckhwd xmm3,xmm3 ; xmm3=(04 04 05 05 06 06 07 07) + + pshufd xmm1,xmm0,0x50 ; xmm1=[col0 col1]=(00 00 00 00 01 01 01 01) + pshufd xmm0,xmm0,0xFA ; xmm0=[col2 col3]=(02 02 02 02 03 03 03 03) + pshufd xmm6,xmm3,0x50 ; xmm6=[col4 col5]=(04 04 04 04 05 05 05 05) + pshufd xmm3,xmm3,0xFA ; xmm3=[col6 col7]=(06 06 06 06 07 07 07 07) + + jmp near .column_end + alignx 16,7 +%endif +.columnDCT: + + ; -- Odd part + + movdqa xmm0, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)] + movdqa xmm1, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_JCOEF)] + pmullw xmm0, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_ISLOW_MULT_TYPE)] + pmullw xmm1, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_ISLOW_MULT_TYPE)] + movdqa xmm2, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_JCOEF)] + movdqa xmm3, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_JCOEF)] + pmullw xmm2, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_ISLOW_MULT_TYPE)] + pmullw xmm3, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_ISLOW_MULT_TYPE)] + + movdqa xmm4,xmm0 + movdqa xmm5,xmm0 + punpcklwd xmm4,xmm1 + punpckhwd xmm5,xmm1 + movdqa xmm0,xmm4 + movdqa xmm1,xmm5 + pmaddwd xmm4,[GOTOFF(ebx,PW_F256_F089)] ; xmm4=(tmp2L) + pmaddwd xmm5,[GOTOFF(ebx,PW_F256_F089)] ; xmm5=(tmp2H) + pmaddwd xmm0,[GOTOFF(ebx,PW_F106_MF217)] ; xmm0=(tmp0L) + pmaddwd xmm1,[GOTOFF(ebx,PW_F106_MF217)] ; xmm1=(tmp0H) + + movdqa xmm6,xmm2 + movdqa xmm7,xmm2 + punpcklwd xmm6,xmm3 + punpckhwd xmm7,xmm3 + movdqa xmm2,xmm6 + movdqa xmm3,xmm7 + pmaddwd xmm6,[GOTOFF(ebx,PW_MF060_MF050)] ; xmm6=(tmp2L) + pmaddwd xmm7,[GOTOFF(ebx,PW_MF060_MF050)] ; xmm7=(tmp2H) + pmaddwd xmm2,[GOTOFF(ebx,PW_F145_MF021)] ; xmm2=(tmp0L) + pmaddwd xmm3,[GOTOFF(ebx,PW_F145_MF021)] ; xmm3=(tmp0H) + + paddd xmm6,xmm4 ; xmm6=tmp2L + paddd xmm7,xmm5 ; xmm7=tmp2H + paddd xmm2,xmm0 ; xmm2=tmp0L + paddd xmm3,xmm1 ; xmm3=tmp0H + + movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=tmp0L + movdqa XMMWORD [wk(1)], xmm3 ; wk(1)=tmp0H + + ; -- Even part + + movdqa xmm4, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_JCOEF)] + movdqa xmm5, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_JCOEF)] + movdqa xmm0, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_JCOEF)] + pmullw xmm4, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)] + pmullw xmm5, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_ISLOW_MULT_TYPE)] + pmullw xmm0, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_ISLOW_MULT_TYPE)] + + pxor xmm1,xmm1 + pxor xmm2,xmm2 + punpcklwd xmm1,xmm4 ; xmm1=tmp0L + punpckhwd xmm2,xmm4 ; xmm2=tmp0H + psrad xmm1,(16-CONST_BITS-1) ; psrad xmm1,16 & pslld xmm1,CONST_BITS+1 + psrad xmm2,(16-CONST_BITS-1) ; psrad xmm2,16 & pslld xmm2,CONST_BITS+1 + + movdqa xmm3,xmm5 ; xmm5=in2=z2 + punpcklwd xmm5,xmm0 ; xmm0=in6=z3 + punpckhwd xmm3,xmm0 + pmaddwd xmm5,[GOTOFF(ebx,PW_F184_MF076)] ; xmm5=tmp2L + pmaddwd xmm3,[GOTOFF(ebx,PW_F184_MF076)] ; xmm3=tmp2H + + movdqa xmm4,xmm1 + movdqa xmm0,xmm2 + paddd xmm1,xmm5 ; xmm1=tmp10L + paddd xmm2,xmm3 ; xmm2=tmp10H + psubd xmm4,xmm5 ; xmm4=tmp12L + psubd xmm0,xmm3 ; xmm0=tmp12H + + ; -- Final output stage + + movdqa xmm5,xmm1 + movdqa xmm3,xmm2 + paddd xmm1,xmm6 ; xmm1=data0L + paddd xmm2,xmm7 ; xmm2=data0H + psubd xmm5,xmm6 ; xmm5=data3L + psubd xmm3,xmm7 ; xmm3=data3H + + movdqa xmm6,[GOTOFF(ebx,PD_DESCALE_P1_4)] ; xmm6=[PD_DESCALE_P1_4] + + paddd xmm1,xmm6 + paddd xmm2,xmm6 + psrad xmm1,DESCALE_P1_4 + psrad xmm2,DESCALE_P1_4 + paddd xmm5,xmm6 + paddd xmm3,xmm6 + psrad xmm5,DESCALE_P1_4 + psrad xmm3,DESCALE_P1_4 + + packssdw xmm1,xmm2 ; xmm1=data0=(00 01 02 03 04 05 06 07) + packssdw xmm5,xmm3 ; xmm5=data3=(30 31 32 33 34 35 36 37) + + movdqa xmm7, XMMWORD [wk(0)] ; xmm7=tmp0L + movdqa xmm6, XMMWORD [wk(1)] ; xmm6=tmp0H + + movdqa xmm2,xmm4 + movdqa xmm3,xmm0 + paddd xmm4,xmm7 ; xmm4=data1L + paddd xmm0,xmm6 ; xmm0=data1H + psubd xmm2,xmm7 ; xmm2=data2L + psubd xmm3,xmm6 ; xmm3=data2H + + movdqa xmm7,[GOTOFF(ebx,PD_DESCALE_P1_4)] ; xmm7=[PD_DESCALE_P1_4] + + paddd xmm4,xmm7 + paddd xmm0,xmm7 + psrad xmm4,DESCALE_P1_4 + psrad xmm0,DESCALE_P1_4 + paddd xmm2,xmm7 + paddd xmm3,xmm7 + psrad xmm2,DESCALE_P1_4 + psrad xmm3,DESCALE_P1_4 + + packssdw xmm4,xmm0 ; xmm4=data1=(10 11 12 13 14 15 16 17) + packssdw xmm2,xmm3 ; xmm2=data2=(20 21 22 23 24 25 26 27) + + movdqa xmm6,xmm1 ; transpose coefficients(phase 1) + punpcklwd xmm1,xmm4 ; xmm1=(00 10 01 11 02 12 03 13) + punpckhwd xmm6,xmm4 ; xmm6=(04 14 05 15 06 16 07 17) + movdqa xmm7,xmm2 ; transpose coefficients(phase 1) + punpcklwd xmm2,xmm5 ; xmm2=(20 30 21 31 22 32 23 33) + punpckhwd xmm7,xmm5 ; xmm7=(24 34 25 35 26 36 27 37) + + movdqa xmm0,xmm1 ; transpose coefficients(phase 2) + punpckldq xmm1,xmm2 ; xmm1=[col0 col1]=(00 10 20 30 01 11 21 31) + punpckhdq xmm0,xmm2 ; xmm0=[col2 col3]=(02 12 22 32 03 13 23 33) + movdqa xmm3,xmm6 ; transpose coefficients(phase 2) + punpckldq xmm6,xmm7 ; xmm6=[col4 col5]=(04 14 24 34 05 15 25 35) + punpckhdq xmm3,xmm7 ; xmm3=[col6 col7]=(06 16 26 36 07 17 27 37) +.column_end: + + ; -- Prefetch the next coefficient block + + prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 0*32] + prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 1*32] + prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 2*32] + prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 3*32] + + ; ---- Pass 2: process rows, store into output array. + + mov eax, [original_ebp] + mov edi, JSAMPARRAY [output_buf(eax)] ; (JSAMPROW *) + mov eax, JDIMENSION [output_col(eax)] + + ; -- Even part + + pxor xmm4,xmm4 + punpcklwd xmm4,xmm1 ; xmm4=tmp0 + psrad xmm4,(16-CONST_BITS-1) ; psrad xmm4,16 & pslld xmm4,CONST_BITS+1 + + ; -- Odd part + + punpckhwd xmm1,xmm0 + punpckhwd xmm6,xmm3 + movdqa xmm5,xmm1 + movdqa xmm2,xmm6 + pmaddwd xmm1,[GOTOFF(ebx,PW_F256_F089)] ; xmm1=(tmp2) + pmaddwd xmm6,[GOTOFF(ebx,PW_MF060_MF050)] ; xmm6=(tmp2) + pmaddwd xmm5,[GOTOFF(ebx,PW_F106_MF217)] ; xmm5=(tmp0) + pmaddwd xmm2,[GOTOFF(ebx,PW_F145_MF021)] ; xmm2=(tmp0) + + paddd xmm6,xmm1 ; xmm6=tmp2 + paddd xmm2,xmm5 ; xmm2=tmp0 + + ; -- Even part + + punpcklwd xmm0,xmm3 + pmaddwd xmm0,[GOTOFF(ebx,PW_F184_MF076)] ; xmm0=tmp2 + + movdqa xmm7,xmm4 + paddd xmm4,xmm0 ; xmm4=tmp10 + psubd xmm7,xmm0 ; xmm7=tmp12 + + ; -- Final output stage + + movdqa xmm1,[GOTOFF(ebx,PD_DESCALE_P2_4)] ; xmm1=[PD_DESCALE_P2_4] + + movdqa xmm5,xmm4 + movdqa xmm3,xmm7 + paddd xmm4,xmm6 ; xmm4=data0=(00 10 20 30) + paddd xmm7,xmm2 ; xmm7=data1=(01 11 21 31) + psubd xmm5,xmm6 ; xmm5=data3=(03 13 23 33) + psubd xmm3,xmm2 ; xmm3=data2=(02 12 22 32) + + paddd xmm4,xmm1 + paddd xmm7,xmm1 + psrad xmm4,DESCALE_P2_4 + psrad xmm7,DESCALE_P2_4 + paddd xmm5,xmm1 + paddd xmm3,xmm1 + psrad xmm5,DESCALE_P2_4 + psrad xmm3,DESCALE_P2_4 + + packssdw xmm4,xmm3 ; xmm4=(00 10 20 30 02 12 22 32) + packssdw xmm7,xmm5 ; xmm7=(01 11 21 31 03 13 23 33) + + movdqa xmm0,xmm4 ; transpose coefficients(phase 1) + punpcklwd xmm4,xmm7 ; xmm4=(00 01 10 11 20 21 30 31) + punpckhwd xmm0,xmm7 ; xmm0=(02 03 12 13 22 23 32 33) + + movdqa xmm6,xmm4 ; transpose coefficients(phase 2) + punpckldq xmm4,xmm0 ; xmm4=(00 01 02 03 10 11 12 13) + punpckhdq xmm6,xmm0 ; xmm6=(20 21 22 23 30 31 32 33) + + packsswb xmm4,xmm6 ; xmm4=(00 01 02 03 10 11 12 13 20 ..) + paddb xmm4,[GOTOFF(ebx,PB_CENTERJSAMP)] + + pshufd xmm2,xmm4,0x39 ; xmm2=(10 11 12 13 20 21 22 23 30 ..) + pshufd xmm1,xmm4,0x4E ; xmm1=(20 21 22 23 30 31 32 33 00 ..) + pshufd xmm3,xmm4,0x93 ; xmm3=(30 31 32 33 00 01 02 03 10 ..) + + mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW] + mov esi, JSAMPROW [edi+1*SIZEOF_JSAMPROW] + movd XMM_DWORD [edx+eax*SIZEOF_JSAMPLE], xmm4 + movd XMM_DWORD [esi+eax*SIZEOF_JSAMPLE], xmm2 + mov edx, JSAMPROW [edi+2*SIZEOF_JSAMPROW] + mov esi, JSAMPROW [edi+3*SIZEOF_JSAMPROW] + movd XMM_DWORD [edx+eax*SIZEOF_JSAMPLE], xmm1 + movd XMM_DWORD [esi+eax*SIZEOF_JSAMPLE], xmm3 + + pop edi + pop esi +; pop edx ; need not be preserved +; pop ecx ; unused + poppic ebx + mov esp,ebp ; esp <- aligned ebp + pop esp ; esp <- original ebp + pop ebp + ret + + +; -------------------------------------------------------------------------- +; +; Perform dequantization and inverse DCT on one block of coefficients, +; producing a reduced-size 2x2 output block. +; +; GLOBAL(void) +; jsimd_idct_2x2_sse2 (void *dct_table, JCOEFPTR coef_block, +; JSAMPARRAY output_buf, JDIMENSION output_col) +; + +%define dct_table(b) (b)+8 ; void *dct_table +%define coef_block(b) (b)+12 ; JCOEFPTR coef_block +%define output_buf(b) (b)+16 ; JSAMPARRAY output_buf +%define output_col(b) (b)+20 ; JDIMENSION output_col + + align 16 + global EXTN(jsimd_idct_2x2_sse2) + +EXTN(jsimd_idct_2x2_sse2): + push ebp + mov ebp,esp + push ebx +; push ecx ; need not be preserved +; push edx ; need not be preserved + push esi + push edi + + get_GOT ebx ; get GOT address + + ; ---- Pass 1: process columns from input. + + mov edx, POINTER [dct_table(ebp)] ; quantptr + mov esi, JCOEFPTR [coef_block(ebp)] ; inptr + + ; | input: | result: | + ; | 00 01 ** 03 ** 05 ** 07 | | + ; | 10 11 ** 13 ** 15 ** 17 | | + ; | ** ** ** ** ** ** ** ** | | + ; | 30 31 ** 33 ** 35 ** 37 | A0 A1 A3 A5 A7 | + ; | ** ** ** ** ** ** ** ** | B0 B1 B3 B5 B7 | + ; | 50 51 ** 53 ** 55 ** 57 | | + ; | ** ** ** ** ** ** ** ** | | + ; | 70 71 ** 73 ** 75 ** 77 | | + + ; -- Odd part + + movdqa xmm0, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)] + movdqa xmm1, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_JCOEF)] + pmullw xmm0, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_ISLOW_MULT_TYPE)] + pmullw xmm1, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_ISLOW_MULT_TYPE)] + movdqa xmm2, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_JCOEF)] + movdqa xmm3, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_JCOEF)] + pmullw xmm2, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_ISLOW_MULT_TYPE)] + pmullw xmm3, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_ISLOW_MULT_TYPE)] + + ; xmm0=(10 11 ** 13 ** 15 ** 17), xmm1=(30 31 ** 33 ** 35 ** 37) + ; xmm2=(50 51 ** 53 ** 55 ** 57), xmm3=(70 71 ** 73 ** 75 ** 77) + + pcmpeqd xmm7,xmm7 + pslld xmm7,WORD_BIT ; xmm7={0x0000 0xFFFF 0x0000 0xFFFF ..} + + movdqa xmm4,xmm0 ; xmm4=(10 11 ** 13 ** 15 ** 17) + movdqa xmm5,xmm2 ; xmm5=(50 51 ** 53 ** 55 ** 57) + punpcklwd xmm4,xmm1 ; xmm4=(10 30 11 31 ** ** 13 33) + punpcklwd xmm5,xmm3 ; xmm5=(50 70 51 71 ** ** 53 73) + pmaddwd xmm4,[GOTOFF(ebx,PW_F362_MF127)] + pmaddwd xmm5,[GOTOFF(ebx,PW_F085_MF072)] + + psrld xmm0,WORD_BIT ; xmm0=(11 -- 13 -- 15 -- 17 --) + pand xmm1,xmm7 ; xmm1=(-- 31 -- 33 -- 35 -- 37) + psrld xmm2,WORD_BIT ; xmm2=(51 -- 53 -- 55 -- 57 --) + pand xmm3,xmm7 ; xmm3=(-- 71 -- 73 -- 75 -- 77) + por xmm0,xmm1 ; xmm0=(11 31 13 33 15 35 17 37) + por xmm2,xmm3 ; xmm2=(51 71 53 73 55 75 57 77) + pmaddwd xmm0,[GOTOFF(ebx,PW_F362_MF127)] + pmaddwd xmm2,[GOTOFF(ebx,PW_F085_MF072)] + + paddd xmm4,xmm5 ; xmm4=tmp0[col0 col1 **** col3] + paddd xmm0,xmm2 ; xmm0=tmp0[col1 col3 col5 col7] + + ; -- Even part + + movdqa xmm6, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_JCOEF)] + pmullw xmm6, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)] + + ; xmm6=(00 01 ** 03 ** 05 ** 07) + + movdqa xmm1,xmm6 ; xmm1=(00 01 ** 03 ** 05 ** 07) + pslld xmm6,WORD_BIT ; xmm6=(-- 00 -- ** -- ** -- **) + pand xmm1,xmm7 ; xmm1=(-- 01 -- 03 -- 05 -- 07) + psrad xmm6,(WORD_BIT-CONST_BITS-2) ; xmm6=tmp10[col0 **** **** ****] + psrad xmm1,(WORD_BIT-CONST_BITS-2) ; xmm1=tmp10[col1 col3 col5 col7] + + ; -- Final output stage + + movdqa xmm3,xmm6 + movdqa xmm5,xmm1 + paddd xmm6,xmm4 ; xmm6=data0[col0 **** **** ****]=(A0 ** ** **) + paddd xmm1,xmm0 ; xmm1=data0[col1 col3 col5 col7]=(A1 A3 A5 A7) + psubd xmm3,xmm4 ; xmm3=data1[col0 **** **** ****]=(B0 ** ** **) + psubd xmm5,xmm0 ; xmm5=data1[col1 col3 col5 col7]=(B1 B3 B5 B7) + + movdqa xmm2,[GOTOFF(ebx,PD_DESCALE_P1_2)] ; xmm2=[PD_DESCALE_P1_2] + + punpckldq xmm6,xmm3 ; xmm6=(A0 B0 ** **) + + movdqa xmm7,xmm1 + punpcklqdq xmm1,xmm5 ; xmm1=(A1 A3 B1 B3) + punpckhqdq xmm7,xmm5 ; xmm7=(A5 A7 B5 B7) + + paddd xmm6,xmm2 + psrad xmm6,DESCALE_P1_2 + + paddd xmm1,xmm2 + paddd xmm7,xmm2 + psrad xmm1,DESCALE_P1_2 + psrad xmm7,DESCALE_P1_2 + + ; -- Prefetch the next coefficient block + + prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 0*32] + prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 1*32] + prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 2*32] + prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 3*32] + + ; ---- Pass 2: process rows, store into output array. + + mov edi, JSAMPARRAY [output_buf(ebp)] ; (JSAMPROW *) + mov eax, JDIMENSION [output_col(ebp)] + + ; | input:| result:| + ; | A0 B0 | | + ; | A1 B1 | C0 C1 | + ; | A3 B3 | D0 D1 | + ; | A5 B5 | | + ; | A7 B7 | | + + ; -- Odd part + + packssdw xmm1,xmm1 ; xmm1=(A1 A3 B1 B3 A1 A3 B1 B3) + packssdw xmm7,xmm7 ; xmm7=(A5 A7 B5 B7 A5 A7 B5 B7) + pmaddwd xmm1,[GOTOFF(ebx,PW_F362_MF127)] + pmaddwd xmm7,[GOTOFF(ebx,PW_F085_MF072)] + + paddd xmm1,xmm7 ; xmm1=tmp0[row0 row1 row0 row1] + + ; -- Even part + + pslld xmm6,(CONST_BITS+2) ; xmm6=tmp10[row0 row1 **** ****] + + ; -- Final output stage + + movdqa xmm4,xmm6 + paddd xmm6,xmm1 ; xmm6=data0[row0 row1 **** ****]=(C0 C1 ** **) + psubd xmm4,xmm1 ; xmm4=data1[row0 row1 **** ****]=(D0 D1 ** **) + + punpckldq xmm6,xmm4 ; xmm6=(C0 D0 C1 D1) + + paddd xmm6,[GOTOFF(ebx,PD_DESCALE_P2_2)] + psrad xmm6,DESCALE_P2_2 + + packssdw xmm6,xmm6 ; xmm6=(C0 D0 C1 D1 C0 D0 C1 D1) + packsswb xmm6,xmm6 ; xmm6=(C0 D0 C1 D1 C0 D0 C1 D1 ..) + paddb xmm6,[GOTOFF(ebx,PB_CENTERJSAMP)] + + pextrw ebx,xmm6,0x00 ; ebx=(C0 D0 -- --) + pextrw ecx,xmm6,0x01 ; ecx=(C1 D1 -- --) + + mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW] + mov esi, JSAMPROW [edi+1*SIZEOF_JSAMPROW] + mov WORD [edx+eax*SIZEOF_JSAMPLE], bx + mov WORD [esi+eax*SIZEOF_JSAMPLE], cx + + pop edi + pop esi +; pop edx ; need not be preserved +; pop ecx ; need not be preserved + pop ebx + pop ebp + ret + +; For some reason, the OS X linker does not honor the request to align the +; segment unless we do this. + align 16 diff --git a/media/libjpeg/simd/jpeg_nbits_table.inc b/media/libjpeg/simd/jpeg_nbits_table.inc new file mode 100644 index 000000000..cbc69904e --- /dev/null +++ b/media/libjpeg/simd/jpeg_nbits_table.inc @@ -0,0 +1,4097 @@ +jpeg_nbits_table db \ + 0, 1, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, \ + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, \ + 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, \ + 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, \ + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, \ + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, \ + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, \ + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, \ + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, \ + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, \ + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, \ + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, \ + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, \ + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, \ + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, \ + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, \ + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, \ + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, \ + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, \ + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, \ + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, \ + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, \ + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, \ + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, \ + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, \ + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, \ + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, \ + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, \ + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, \ + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, \ + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, \ + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, \ + 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, \ + 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, \ + 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, \ + 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, \ + 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, \ + 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, \ + 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, \ + 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, \ + 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, \ + 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, \ + 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, \ + 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, \ + 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, \ + 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, \ + 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, \ + 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, \ + 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, \ + 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, \ + 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, \ + 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, \ + 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, \ + 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, \ + 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, \ + 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, \ + 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, \ + 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, \ + 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, \ + 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, \ + 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, \ + 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, \ + 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, \ + 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, \ + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \ + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \ + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \ + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \ + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \ + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \ + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \ + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \ + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \ + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \ + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \ + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \ + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \ + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \ + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \ + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \ + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \ + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \ + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \ + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \ + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \ + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \ + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \ + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \ + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \ + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \ + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \ + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \ + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \ + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \ + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \ + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \ + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \ + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \ + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \ + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \ + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \ + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \ + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \ + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \ + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \ + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \ + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \ + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \ + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \ + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \ + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \ + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \ + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \ + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \ + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \ + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \ + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \ + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \ + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \ + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \ + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \ + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \ + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \ + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \ + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \ + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \ + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \ + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16 diff --git a/media/libjpeg/simd/jquant-3dn.asm b/media/libjpeg/simd/jquant-3dn.asm new file mode 100644 index 000000000..0b4164b26 --- /dev/null +++ b/media/libjpeg/simd/jquant-3dn.asm @@ -0,0 +1,232 @@ +; +; jquant.asm - sample data conversion and quantization (3DNow! & MMX) +; +; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB +; +; Based on the x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler). +; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 +; +; [TAB8] + +%include "jsimdext.inc" +%include "jdct.inc" + +; -------------------------------------------------------------------------- + SECTION SEG_TEXT + BITS 32 +; +; Load data into workspace, applying unsigned->signed conversion +; +; GLOBAL(void) +; jsimd_convsamp_float_3dnow (JSAMPARRAY sample_data, JDIMENSION start_col, +; FAST_FLOAT *workspace); +; + +%define sample_data ebp+8 ; JSAMPARRAY sample_data +%define start_col ebp+12 ; JDIMENSION start_col +%define workspace ebp+16 ; FAST_FLOAT *workspace + + align 16 + global EXTN(jsimd_convsamp_float_3dnow) + +EXTN(jsimd_convsamp_float_3dnow): + push ebp + mov ebp,esp + push ebx +; push ecx ; need not be preserved +; push edx ; need not be preserved + push esi + push edi + + pcmpeqw mm7,mm7 + psllw mm7,7 + packsswb mm7,mm7 ; mm7 = PB_CENTERJSAMPLE (0x808080..) + + mov esi, JSAMPARRAY [sample_data] ; (JSAMPROW *) + mov eax, JDIMENSION [start_col] + mov edi, POINTER [workspace] ; (DCTELEM *) + mov ecx, DCTSIZE/2 + alignx 16,7 +.convloop: + mov ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW] ; (JSAMPLE *) + mov edx, JSAMPROW [esi+1*SIZEOF_JSAMPROW] ; (JSAMPLE *) + + movq mm0, MMWORD [ebx+eax*SIZEOF_JSAMPLE] + movq mm1, MMWORD [edx+eax*SIZEOF_JSAMPLE] + + psubb mm0,mm7 ; mm0=(01234567) + psubb mm1,mm7 ; mm1=(89ABCDEF) + + punpcklbw mm2,mm0 ; mm2=(*0*1*2*3) + punpckhbw mm0,mm0 ; mm0=(*4*5*6*7) + punpcklbw mm3,mm1 ; mm3=(*8*9*A*B) + punpckhbw mm1,mm1 ; mm1=(*C*D*E*F) + + punpcklwd mm4,mm2 ; mm4=(***0***1) + punpckhwd mm2,mm2 ; mm2=(***2***3) + punpcklwd mm5,mm0 ; mm5=(***4***5) + punpckhwd mm0,mm0 ; mm0=(***6***7) + + psrad mm4,(DWORD_BIT-BYTE_BIT) ; mm4=(01) + psrad mm2,(DWORD_BIT-BYTE_BIT) ; mm2=(23) + pi2fd mm4,mm4 + pi2fd mm2,mm2 + psrad mm5,(DWORD_BIT-BYTE_BIT) ; mm5=(45) + psrad mm0,(DWORD_BIT-BYTE_BIT) ; mm0=(67) + pi2fd mm5,mm5 + pi2fd mm0,mm0 + + movq MMWORD [MMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], mm4 + movq MMWORD [MMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], mm2 + movq MMWORD [MMBLOCK(0,2,edi,SIZEOF_FAST_FLOAT)], mm5 + movq MMWORD [MMBLOCK(0,3,edi,SIZEOF_FAST_FLOAT)], mm0 + + punpcklwd mm6,mm3 ; mm6=(***8***9) + punpckhwd mm3,mm3 ; mm3=(***A***B) + punpcklwd mm4,mm1 ; mm4=(***C***D) + punpckhwd mm1,mm1 ; mm1=(***E***F) + + psrad mm6,(DWORD_BIT-BYTE_BIT) ; mm6=(89) + psrad mm3,(DWORD_BIT-BYTE_BIT) ; mm3=(AB) + pi2fd mm6,mm6 + pi2fd mm3,mm3 + psrad mm4,(DWORD_BIT-BYTE_BIT) ; mm4=(CD) + psrad mm1,(DWORD_BIT-BYTE_BIT) ; mm1=(EF) + pi2fd mm4,mm4 + pi2fd mm1,mm1 + + movq MMWORD [MMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], mm6 + movq MMWORD [MMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], mm3 + movq MMWORD [MMBLOCK(1,2,edi,SIZEOF_FAST_FLOAT)], mm4 + movq MMWORD [MMBLOCK(1,3,edi,SIZEOF_FAST_FLOAT)], mm1 + + add esi, byte 2*SIZEOF_JSAMPROW + add edi, byte 2*DCTSIZE*SIZEOF_FAST_FLOAT + dec ecx + jnz near .convloop + + femms ; empty MMX/3DNow! state + + pop edi + pop esi +; pop edx ; need not be preserved +; pop ecx ; need not be preserved + pop ebx + pop ebp + ret + + +; -------------------------------------------------------------------------- +; +; Quantize/descale the coefficients, and store into coef_block +; +; GLOBAL(void) +; jsimd_quantize_float_3dnow (JCOEFPTR coef_block, FAST_FLOAT *divisors, +; FAST_FLOAT *workspace); +; + +%define coef_block ebp+8 ; JCOEFPTR coef_block +%define divisors ebp+12 ; FAST_FLOAT *divisors +%define workspace ebp+16 ; FAST_FLOAT *workspace + + align 16 + global EXTN(jsimd_quantize_float_3dnow) + +EXTN(jsimd_quantize_float_3dnow): + push ebp + mov ebp,esp +; push ebx ; unused +; push ecx ; unused +; push edx ; need not be preserved + push esi + push edi + + mov eax, 0x4B400000 ; (float)0x00C00000 (rndint_magic) + movd mm7,eax + punpckldq mm7,mm7 ; mm7={12582912.0F 12582912.0F} + + mov esi, POINTER [workspace] + mov edx, POINTER [divisors] + mov edi, JCOEFPTR [coef_block] + mov eax, DCTSIZE2/16 + alignx 16,7 +.quantloop: + movq mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_FAST_FLOAT)] + movq mm1, MMWORD [MMBLOCK(0,1,esi,SIZEOF_FAST_FLOAT)] + pfmul mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)] + pfmul mm1, MMWORD [MMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)] + movq mm2, MMWORD [MMBLOCK(0,2,esi,SIZEOF_FAST_FLOAT)] + movq mm3, MMWORD [MMBLOCK(0,3,esi,SIZEOF_FAST_FLOAT)] + pfmul mm2, MMWORD [MMBLOCK(0,2,edx,SIZEOF_FAST_FLOAT)] + pfmul mm3, MMWORD [MMBLOCK(0,3,edx,SIZEOF_FAST_FLOAT)] + + pfadd mm0,mm7 ; mm0=(00 ** 01 **) + pfadd mm1,mm7 ; mm1=(02 ** 03 **) + pfadd mm2,mm7 ; mm0=(04 ** 05 **) + pfadd mm3,mm7 ; mm1=(06 ** 07 **) + + movq mm4,mm0 + punpcklwd mm0,mm1 ; mm0=(00 02 ** **) + punpckhwd mm4,mm1 ; mm4=(01 03 ** **) + movq mm5,mm2 + punpcklwd mm2,mm3 ; mm2=(04 06 ** **) + punpckhwd mm5,mm3 ; mm5=(05 07 ** **) + + punpcklwd mm0,mm4 ; mm0=(00 01 02 03) + punpcklwd mm2,mm5 ; mm2=(04 05 06 07) + + movq mm6, MMWORD [MMBLOCK(1,0,esi,SIZEOF_FAST_FLOAT)] + movq mm1, MMWORD [MMBLOCK(1,1,esi,SIZEOF_FAST_FLOAT)] + pfmul mm6, MMWORD [MMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)] + pfmul mm1, MMWORD [MMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)] + movq mm3, MMWORD [MMBLOCK(1,2,esi,SIZEOF_FAST_FLOAT)] + movq mm4, MMWORD [MMBLOCK(1,3,esi,SIZEOF_FAST_FLOAT)] + pfmul mm3, MMWORD [MMBLOCK(1,2,edx,SIZEOF_FAST_FLOAT)] + pfmul mm4, MMWORD [MMBLOCK(1,3,edx,SIZEOF_FAST_FLOAT)] + + pfadd mm6,mm7 ; mm0=(10 ** 11 **) + pfadd mm1,mm7 ; mm4=(12 ** 13 **) + pfadd mm3,mm7 ; mm0=(14 ** 15 **) + pfadd mm4,mm7 ; mm4=(16 ** 17 **) + + movq mm5,mm6 + punpcklwd mm6,mm1 ; mm6=(10 12 ** **) + punpckhwd mm5,mm1 ; mm5=(11 13 ** **) + movq mm1,mm3 + punpcklwd mm3,mm4 ; mm3=(14 16 ** **) + punpckhwd mm1,mm4 ; mm1=(15 17 ** **) + + punpcklwd mm6,mm5 ; mm6=(10 11 12 13) + punpcklwd mm3,mm1 ; mm3=(14 15 16 17) + + movq MMWORD [MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm0 + movq MMWORD [MMBLOCK(0,1,edi,SIZEOF_JCOEF)], mm2 + movq MMWORD [MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm6 + movq MMWORD [MMBLOCK(1,1,edi,SIZEOF_JCOEF)], mm3 + + add esi, byte 16*SIZEOF_FAST_FLOAT + add edx, byte 16*SIZEOF_FAST_FLOAT + add edi, byte 16*SIZEOF_JCOEF + dec eax + jnz near .quantloop + + femms ; empty MMX/3DNow! state + + pop edi + pop esi +; pop edx ; need not be preserved +; pop ecx ; unused +; pop ebx ; unused + pop ebp + ret + +; For some reason, the OS X linker does not honor the request to align the +; segment unless we do this. + align 16 diff --git a/media/libjpeg/simd/jquant-mmx.asm b/media/libjpeg/simd/jquant-mmx.asm new file mode 100644 index 000000000..aed6071bf --- /dev/null +++ b/media/libjpeg/simd/jquant-mmx.asm @@ -0,0 +1,273 @@ +; +; jquant.asm - sample data conversion and quantization (MMX) +; +; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB +; +; Based on the x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler). +; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 +; +; [TAB8] + +%include "jsimdext.inc" +%include "jdct.inc" + +; -------------------------------------------------------------------------- + SECTION SEG_TEXT + BITS 32 +; +; Load data into workspace, applying unsigned->signed conversion +; +; GLOBAL(void) +; jsimd_convsamp_mmx (JSAMPARRAY sample_data, JDIMENSION start_col, +; DCTELEM *workspace); +; + +%define sample_data ebp+8 ; JSAMPARRAY sample_data +%define start_col ebp+12 ; JDIMENSION start_col +%define workspace ebp+16 ; DCTELEM *workspace + + align 16 + global EXTN(jsimd_convsamp_mmx) + +EXTN(jsimd_convsamp_mmx): + push ebp + mov ebp,esp + push ebx +; push ecx ; need not be preserved +; push edx ; need not be preserved + push esi + push edi + + pxor mm6,mm6 ; mm6=(all 0's) + pcmpeqw mm7,mm7 + psllw mm7,7 ; mm7={0xFF80 0xFF80 0xFF80 0xFF80} + + mov esi, JSAMPARRAY [sample_data] ; (JSAMPROW *) + mov eax, JDIMENSION [start_col] + mov edi, POINTER [workspace] ; (DCTELEM *) + mov ecx, DCTSIZE/4 + alignx 16,7 +.convloop: + mov ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW] ; (JSAMPLE *) + mov edx, JSAMPROW [esi+1*SIZEOF_JSAMPROW] ; (JSAMPLE *) + + movq mm0, MMWORD [ebx+eax*SIZEOF_JSAMPLE] ; mm0=(01234567) + movq mm1, MMWORD [edx+eax*SIZEOF_JSAMPLE] ; mm1=(89ABCDEF) + + mov ebx, JSAMPROW [esi+2*SIZEOF_JSAMPROW] ; (JSAMPLE *) + mov edx, JSAMPROW [esi+3*SIZEOF_JSAMPROW] ; (JSAMPLE *) + + movq mm2, MMWORD [ebx+eax*SIZEOF_JSAMPLE] ; mm2=(GHIJKLMN) + movq mm3, MMWORD [edx+eax*SIZEOF_JSAMPLE] ; mm3=(OPQRSTUV) + + movq mm4,mm0 + punpcklbw mm0,mm6 ; mm0=(0123) + punpckhbw mm4,mm6 ; mm4=(4567) + movq mm5,mm1 + punpcklbw mm1,mm6 ; mm1=(89AB) + punpckhbw mm5,mm6 ; mm5=(CDEF) + + paddw mm0,mm7 + paddw mm4,mm7 + paddw mm1,mm7 + paddw mm5,mm7 + + movq MMWORD [MMBLOCK(0,0,edi,SIZEOF_DCTELEM)], mm0 + movq MMWORD [MMBLOCK(0,1,edi,SIZEOF_DCTELEM)], mm4 + movq MMWORD [MMBLOCK(1,0,edi,SIZEOF_DCTELEM)], mm1 + movq MMWORD [MMBLOCK(1,1,edi,SIZEOF_DCTELEM)], mm5 + + movq mm0,mm2 + punpcklbw mm2,mm6 ; mm2=(GHIJ) + punpckhbw mm0,mm6 ; mm0=(KLMN) + movq mm4,mm3 + punpcklbw mm3,mm6 ; mm3=(OPQR) + punpckhbw mm4,mm6 ; mm4=(STUV) + + paddw mm2,mm7 + paddw mm0,mm7 + paddw mm3,mm7 + paddw mm4,mm7 + + movq MMWORD [MMBLOCK(2,0,edi,SIZEOF_DCTELEM)], mm2 + movq MMWORD [MMBLOCK(2,1,edi,SIZEOF_DCTELEM)], mm0 + movq MMWORD [MMBLOCK(3,0,edi,SIZEOF_DCTELEM)], mm3 + movq MMWORD [MMBLOCK(3,1,edi,SIZEOF_DCTELEM)], mm4 + + add esi, byte 4*SIZEOF_JSAMPROW + add edi, byte 4*DCTSIZE*SIZEOF_DCTELEM + dec ecx + jnz short .convloop + + emms ; empty MMX state + + pop edi + pop esi +; pop edx ; need not be preserved +; pop ecx ; need not be preserved + pop ebx + pop ebp + ret + +; -------------------------------------------------------------------------- +; +; Quantize/descale the coefficients, and store into coef_block +; +; This implementation is based on an algorithm described in +; "How to optimize for the Pentium family of microprocessors" +; (http://www.agner.org/assem/). +; +; GLOBAL(void) +; jsimd_quantize_mmx (JCOEFPTR coef_block, DCTELEM *divisors, +; DCTELEM *workspace); +; + +%define RECIPROCAL(m,n,b) MMBLOCK(DCTSIZE*0+(m),(n),(b),SIZEOF_DCTELEM) +%define CORRECTION(m,n,b) MMBLOCK(DCTSIZE*1+(m),(n),(b),SIZEOF_DCTELEM) +%define SCALE(m,n,b) MMBLOCK(DCTSIZE*2+(m),(n),(b),SIZEOF_DCTELEM) +%define SHIFT(m,n,b) MMBLOCK(DCTSIZE*3+(m),(n),(b),SIZEOF_DCTELEM) + +%define coef_block ebp+8 ; JCOEFPTR coef_block +%define divisors ebp+12 ; DCTELEM *divisors +%define workspace ebp+16 ; DCTELEM *workspace + + align 16 + global EXTN(jsimd_quantize_mmx) + +EXTN(jsimd_quantize_mmx): + push ebp + mov ebp,esp +; push ebx ; unused +; push ecx ; unused +; push edx ; need not be preserved + push esi + push edi + + mov esi, POINTER [workspace] + mov edx, POINTER [divisors] + mov edi, JCOEFPTR [coef_block] + mov ah, 2 + alignx 16,7 +.quantloop1: + mov al, DCTSIZE2/8/2 + alignx 16,7 +.quantloop2: + movq mm2, MMWORD [MMBLOCK(0,0,esi,SIZEOF_DCTELEM)] + movq mm3, MMWORD [MMBLOCK(0,1,esi,SIZEOF_DCTELEM)] + + movq mm0,mm2 + movq mm1,mm3 + + psraw mm2,(WORD_BIT-1) ; -1 if value < 0, 0 otherwise + psraw mm3,(WORD_BIT-1) + + pxor mm0,mm2 ; val = -val + pxor mm1,mm3 + psubw mm0,mm2 + psubw mm1,mm3 + + ; + ; MMX is an annoyingly crappy instruction set. It has two + ; misfeatures that are causing problems here: + ; + ; - All multiplications are signed. + ; + ; - The second operand for the shifts is not treated as packed. + ; + ; + ; We work around the first problem by implementing this algorithm: + ; + ; unsigned long unsigned_multiply(unsigned short x, unsigned short y) + ; { + ; enum { SHORT_BIT = 16 }; + ; signed short sx = (signed short) x; + ; signed short sy = (signed short) y; + ; signed long sz; + ; + ; sz = (long) sx * (long) sy; /* signed multiply */ + ; + ; if (sx < 0) sz += (long) sy << SHORT_BIT; + ; if (sy < 0) sz += (long) sx << SHORT_BIT; + ; + ; return (unsigned long) sz; + ; } + ; + ; (note that a negative sx adds _sy_ and vice versa) + ; + ; For the second problem, we replace the shift by a multiplication. + ; Unfortunately that means we have to deal with the signed issue again. + ; + + paddw mm0, MMWORD [CORRECTION(0,0,edx)] ; correction + roundfactor + paddw mm1, MMWORD [CORRECTION(0,1,edx)] + + movq mm4,mm0 ; store current value for later + movq mm5,mm1 + pmulhw mm0, MMWORD [RECIPROCAL(0,0,edx)] ; reciprocal + pmulhw mm1, MMWORD [RECIPROCAL(0,1,edx)] + paddw mm0,mm4 ; reciprocal is always negative (MSB=1), + paddw mm1,mm5 ; so we always need to add the initial value + ; (input value is never negative as we + ; inverted it at the start of this routine) + + ; here it gets a bit tricky as both scale + ; and mm0/mm1 can be negative + movq mm6, MMWORD [SCALE(0,0,edx)] ; scale + movq mm7, MMWORD [SCALE(0,1,edx)] + movq mm4,mm0 + movq mm5,mm1 + pmulhw mm0,mm6 + pmulhw mm1,mm7 + + psraw mm6,(WORD_BIT-1) ; determine if scale is negative + psraw mm7,(WORD_BIT-1) + + pand mm6,mm4 ; and add input if it is + pand mm7,mm5 + paddw mm0,mm6 + paddw mm1,mm7 + + psraw mm4,(WORD_BIT-1) ; then check if negative input + psraw mm5,(WORD_BIT-1) + + pand mm4, MMWORD [SCALE(0,0,edx)] ; and add scale if it is + pand mm5, MMWORD [SCALE(0,1,edx)] + paddw mm0,mm4 + paddw mm1,mm5 + + pxor mm0,mm2 ; val = -val + pxor mm1,mm3 + psubw mm0,mm2 + psubw mm1,mm3 + + movq MMWORD [MMBLOCK(0,0,edi,SIZEOF_DCTELEM)], mm0 + movq MMWORD [MMBLOCK(0,1,edi,SIZEOF_DCTELEM)], mm1 + + add esi, byte 8*SIZEOF_DCTELEM + add edx, byte 8*SIZEOF_DCTELEM + add edi, byte 8*SIZEOF_JCOEF + dec al + jnz near .quantloop2 + dec ah + jnz near .quantloop1 ; to avoid branch misprediction + + emms ; empty MMX state + + pop edi + pop esi +; pop edx ; need not be preserved +; pop ecx ; unused +; pop ebx ; unused + pop ebp + ret + +; For some reason, the OS X linker does not honor the request to align the +; segment unless we do this. + align 16 diff --git a/media/libjpeg/simd/jquant-sse.asm b/media/libjpeg/simd/jquant-sse.asm new file mode 100644 index 000000000..1baf88f25 --- /dev/null +++ b/media/libjpeg/simd/jquant-sse.asm @@ -0,0 +1,210 @@ +; +; jquant.asm - sample data conversion and quantization (SSE & MMX) +; +; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB +; +; Based on the x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler). +; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 +; +; [TAB8] + +%include "jsimdext.inc" +%include "jdct.inc" + +; -------------------------------------------------------------------------- + SECTION SEG_TEXT + BITS 32 +; +; Load data into workspace, applying unsigned->signed conversion +; +; GLOBAL(void) +; jsimd_convsamp_float_sse (JSAMPARRAY sample_data, JDIMENSION start_col, +; FAST_FLOAT *workspace); +; + +%define sample_data ebp+8 ; JSAMPARRAY sample_data +%define start_col ebp+12 ; JDIMENSION start_col +%define workspace ebp+16 ; FAST_FLOAT *workspace + + align 16 + global EXTN(jsimd_convsamp_float_sse) + +EXTN(jsimd_convsamp_float_sse): + push ebp + mov ebp,esp + push ebx +; push ecx ; need not be preserved +; push edx ; need not be preserved + push esi + push edi + + pcmpeqw mm7,mm7 + psllw mm7,7 + packsswb mm7,mm7 ; mm7 = PB_CENTERJSAMPLE (0x808080..) + + mov esi, JSAMPARRAY [sample_data] ; (JSAMPROW *) + mov eax, JDIMENSION [start_col] + mov edi, POINTER [workspace] ; (DCTELEM *) + mov ecx, DCTSIZE/2 + alignx 16,7 +.convloop: + mov ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW] ; (JSAMPLE *) + mov edx, JSAMPROW [esi+1*SIZEOF_JSAMPROW] ; (JSAMPLE *) + + movq mm0, MMWORD [ebx+eax*SIZEOF_JSAMPLE] + movq mm1, MMWORD [edx+eax*SIZEOF_JSAMPLE] + + psubb mm0,mm7 ; mm0=(01234567) + psubb mm1,mm7 ; mm1=(89ABCDEF) + + punpcklbw mm2,mm0 ; mm2=(*0*1*2*3) + punpckhbw mm0,mm0 ; mm0=(*4*5*6*7) + punpcklbw mm3,mm1 ; mm3=(*8*9*A*B) + punpckhbw mm1,mm1 ; mm1=(*C*D*E*F) + + punpcklwd mm4,mm2 ; mm4=(***0***1) + punpckhwd mm2,mm2 ; mm2=(***2***3) + punpcklwd mm5,mm0 ; mm5=(***4***5) + punpckhwd mm0,mm0 ; mm0=(***6***7) + + psrad mm4,(DWORD_BIT-BYTE_BIT) ; mm4=(01) + psrad mm2,(DWORD_BIT-BYTE_BIT) ; mm2=(23) + cvtpi2ps xmm0,mm4 ; xmm0=(01**) + cvtpi2ps xmm1,mm2 ; xmm1=(23**) + psrad mm5,(DWORD_BIT-BYTE_BIT) ; mm5=(45) + psrad mm0,(DWORD_BIT-BYTE_BIT) ; mm0=(67) + cvtpi2ps xmm2,mm5 ; xmm2=(45**) + cvtpi2ps xmm3,mm0 ; xmm3=(67**) + + punpcklwd mm6,mm3 ; mm6=(***8***9) + punpckhwd mm3,mm3 ; mm3=(***A***B) + punpcklwd mm4,mm1 ; mm4=(***C***D) + punpckhwd mm1,mm1 ; mm1=(***E***F) + + psrad mm6,(DWORD_BIT-BYTE_BIT) ; mm6=(89) + psrad mm3,(DWORD_BIT-BYTE_BIT) ; mm3=(AB) + cvtpi2ps xmm4,mm6 ; xmm4=(89**) + cvtpi2ps xmm5,mm3 ; xmm5=(AB**) + psrad mm4,(DWORD_BIT-BYTE_BIT) ; mm4=(CD) + psrad mm1,(DWORD_BIT-BYTE_BIT) ; mm1=(EF) + cvtpi2ps xmm6,mm4 ; xmm6=(CD**) + cvtpi2ps xmm7,mm1 ; xmm7=(EF**) + + movlhps xmm0,xmm1 ; xmm0=(0123) + movlhps xmm2,xmm3 ; xmm2=(4567) + movlhps xmm4,xmm5 ; xmm4=(89AB) + movlhps xmm6,xmm7 ; xmm6=(CDEF) + + movaps XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], xmm0 + movaps XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm2 + movaps XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], xmm4 + movaps XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm6 + + add esi, byte 2*SIZEOF_JSAMPROW + add edi, byte 2*DCTSIZE*SIZEOF_FAST_FLOAT + dec ecx + jnz near .convloop + + emms ; empty MMX state + + pop edi + pop esi +; pop edx ; need not be preserved +; pop ecx ; need not be preserved + pop ebx + pop ebp + ret + + +; -------------------------------------------------------------------------- +; +; Quantize/descale the coefficients, and store into coef_block +; +; GLOBAL(void) +; jsimd_quantize_float_sse (JCOEFPTR coef_block, FAST_FLOAT *divisors, +; FAST_FLOAT *workspace); +; + +%define coef_block ebp+8 ; JCOEFPTR coef_block +%define divisors ebp+12 ; FAST_FLOAT *divisors +%define workspace ebp+16 ; FAST_FLOAT *workspace + + align 16 + global EXTN(jsimd_quantize_float_sse) + +EXTN(jsimd_quantize_float_sse): + push ebp + mov ebp,esp +; push ebx ; unused +; push ecx ; unused +; push edx ; need not be preserved + push esi + push edi + + mov esi, POINTER [workspace] + mov edx, POINTER [divisors] + mov edi, JCOEFPTR [coef_block] + mov eax, DCTSIZE2/16 + alignx 16,7 +.quantloop: + movaps xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_FAST_FLOAT)] + movaps xmm1, XMMWORD [XMMBLOCK(0,1,esi,SIZEOF_FAST_FLOAT)] + mulps xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)] + mulps xmm1, XMMWORD [XMMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)] + movaps xmm2, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_FAST_FLOAT)] + movaps xmm3, XMMWORD [XMMBLOCK(1,1,esi,SIZEOF_FAST_FLOAT)] + mulps xmm2, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)] + mulps xmm3, XMMWORD [XMMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)] + + movhlps xmm4,xmm0 + movhlps xmm5,xmm1 + + cvtps2pi mm0,xmm0 + cvtps2pi mm1,xmm1 + cvtps2pi mm4,xmm4 + cvtps2pi mm5,xmm5 + + movhlps xmm6,xmm2 + movhlps xmm7,xmm3 + + cvtps2pi mm2,xmm2 + cvtps2pi mm3,xmm3 + cvtps2pi mm6,xmm6 + cvtps2pi mm7,xmm7 + + packssdw mm0,mm4 + packssdw mm1,mm5 + packssdw mm2,mm6 + packssdw mm3,mm7 + + movq MMWORD [MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm0 + movq MMWORD [MMBLOCK(0,1,edi,SIZEOF_JCOEF)], mm1 + movq MMWORD [MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm2 + movq MMWORD [MMBLOCK(1,1,edi,SIZEOF_JCOEF)], mm3 + + add esi, byte 16*SIZEOF_FAST_FLOAT + add edx, byte 16*SIZEOF_FAST_FLOAT + add edi, byte 16*SIZEOF_JCOEF + dec eax + jnz short .quantloop + + emms ; empty MMX state + + pop edi + pop esi +; pop edx ; need not be preserved +; pop ecx ; unused +; pop ebx ; unused + pop ebp + ret + +; For some reason, the OS X linker does not honor the request to align the +; segment unless we do this. + align 16 diff --git a/media/libjpeg/simd/jquantf-sse2-64.asm b/media/libjpeg/simd/jquantf-sse2-64.asm new file mode 100644 index 000000000..ef5c1f959 --- /dev/null +++ b/media/libjpeg/simd/jquantf-sse2-64.asm @@ -0,0 +1,157 @@ +; +; jquantf.asm - sample data conversion and quantization (64-bit SSE & SSE2) +; +; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB +; Copyright (C) 2009, D. R. Commander. +; +; Based on the x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler). +; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 +; +; [TAB8] + +%include "jsimdext.inc" +%include "jdct.inc" + +; -------------------------------------------------------------------------- + SECTION SEG_TEXT + BITS 64 +; +; Load data into workspace, applying unsigned->signed conversion +; +; GLOBAL(void) +; jsimd_convsamp_float_sse2 (JSAMPARRAY sample_data, JDIMENSION start_col, +; FAST_FLOAT *workspace); +; + +; r10 = JSAMPARRAY sample_data +; r11 = JDIMENSION start_col +; r12 = FAST_FLOAT *workspace + + align 16 + global EXTN(jsimd_convsamp_float_sse2) + +EXTN(jsimd_convsamp_float_sse2): + push rbp + mov rax,rsp + mov rbp,rsp + collect_args + push rbx + + pcmpeqw xmm7,xmm7 + psllw xmm7,7 + packsswb xmm7,xmm7 ; xmm7 = PB_CENTERJSAMPLE (0x808080..) + + mov rsi, r10 + mov eax, r11d + mov rdi, r12 + mov rcx, DCTSIZE/2 +.convloop: + mov rbx, JSAMPROW [rsi+0*SIZEOF_JSAMPROW] ; (JSAMPLE *) + mov rdx, JSAMPROW [rsi+1*SIZEOF_JSAMPROW] ; (JSAMPLE *) + + movq xmm0, XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE] + movq xmm1, XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE] + + psubb xmm0,xmm7 ; xmm0=(01234567) + psubb xmm1,xmm7 ; xmm1=(89ABCDEF) + + punpcklbw xmm0,xmm0 ; xmm0=(*0*1*2*3*4*5*6*7) + punpcklbw xmm1,xmm1 ; xmm1=(*8*9*A*B*C*D*E*F) + + punpcklwd xmm2,xmm0 ; xmm2=(***0***1***2***3) + punpckhwd xmm0,xmm0 ; xmm0=(***4***5***6***7) + punpcklwd xmm3,xmm1 ; xmm3=(***8***9***A***B) + punpckhwd xmm1,xmm1 ; xmm1=(***C***D***E***F) + + psrad xmm2,(DWORD_BIT-BYTE_BIT) ; xmm2=(0123) + psrad xmm0,(DWORD_BIT-BYTE_BIT) ; xmm0=(4567) + cvtdq2ps xmm2,xmm2 ; xmm2=(0123) + cvtdq2ps xmm0,xmm0 ; xmm0=(4567) + psrad xmm3,(DWORD_BIT-BYTE_BIT) ; xmm3=(89AB) + psrad xmm1,(DWORD_BIT-BYTE_BIT) ; xmm1=(CDEF) + cvtdq2ps xmm3,xmm3 ; xmm3=(89AB) + cvtdq2ps xmm1,xmm1 ; xmm1=(CDEF) + + movaps XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_FAST_FLOAT)], xmm2 + movaps XMMWORD [XMMBLOCK(0,1,rdi,SIZEOF_FAST_FLOAT)], xmm0 + movaps XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_FAST_FLOAT)], xmm3 + movaps XMMWORD [XMMBLOCK(1,1,rdi,SIZEOF_FAST_FLOAT)], xmm1 + + add rsi, byte 2*SIZEOF_JSAMPROW + add rdi, byte 2*DCTSIZE*SIZEOF_FAST_FLOAT + dec rcx + jnz short .convloop + + pop rbx + uncollect_args + pop rbp + ret + + +; -------------------------------------------------------------------------- +; +; Quantize/descale the coefficients, and store into coef_block +; +; GLOBAL(void) +; jsimd_quantize_float_sse2 (JCOEFPTR coef_block, FAST_FLOAT *divisors, +; FAST_FLOAT *workspace); +; + +; r10 = JCOEFPTR coef_block +; r11 = FAST_FLOAT *divisors +; r12 = FAST_FLOAT *workspace + + align 16 + global EXTN(jsimd_quantize_float_sse2) + +EXTN(jsimd_quantize_float_sse2): + push rbp + mov rax,rsp + mov rbp,rsp + collect_args + + mov rsi, r12 + mov rdx, r11 + mov rdi, r10 + mov rax, DCTSIZE2/16 +.quantloop: + movaps xmm0, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_FAST_FLOAT)] + movaps xmm1, XMMWORD [XMMBLOCK(0,1,rsi,SIZEOF_FAST_FLOAT)] + mulps xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FAST_FLOAT)] + mulps xmm1, XMMWORD [XMMBLOCK(0,1,rdx,SIZEOF_FAST_FLOAT)] + movaps xmm2, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_FAST_FLOAT)] + movaps xmm3, XMMWORD [XMMBLOCK(1,1,rsi,SIZEOF_FAST_FLOAT)] + mulps xmm2, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_FAST_FLOAT)] + mulps xmm3, XMMWORD [XMMBLOCK(1,1,rdx,SIZEOF_FAST_FLOAT)] + + cvtps2dq xmm0,xmm0 + cvtps2dq xmm1,xmm1 + cvtps2dq xmm2,xmm2 + cvtps2dq xmm3,xmm3 + + packssdw xmm0,xmm1 + packssdw xmm2,xmm3 + + movdqa XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_JCOEF)], xmm0 + movdqa XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_JCOEF)], xmm2 + + add rsi, byte 16*SIZEOF_FAST_FLOAT + add rdx, byte 16*SIZEOF_FAST_FLOAT + add rdi, byte 16*SIZEOF_JCOEF + dec rax + jnz short .quantloop + + uncollect_args + pop rbp + ret + +; For some reason, the OS X linker does not honor the request to align the +; segment unless we do this. + align 16 diff --git a/media/libjpeg/simd/jquantf-sse2.asm b/media/libjpeg/simd/jquantf-sse2.asm new file mode 100644 index 000000000..1cbc26740 --- /dev/null +++ b/media/libjpeg/simd/jquantf-sse2.asm @@ -0,0 +1,170 @@ +; +; jquantf.asm - sample data conversion and quantization (SSE & SSE2) +; +; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB +; +; Based on the x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler). +; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 +; +; [TAB8] + +%include "jsimdext.inc" +%include "jdct.inc" + +; -------------------------------------------------------------------------- + SECTION SEG_TEXT + BITS 32 +; +; Load data into workspace, applying unsigned->signed conversion +; +; GLOBAL(void) +; jsimd_convsamp_float_sse2 (JSAMPARRAY sample_data, JDIMENSION start_col, +; FAST_FLOAT *workspace); +; + +%define sample_data ebp+8 ; JSAMPARRAY sample_data +%define start_col ebp+12 ; JDIMENSION start_col +%define workspace ebp+16 ; FAST_FLOAT *workspace + + align 16 + global EXTN(jsimd_convsamp_float_sse2) + +EXTN(jsimd_convsamp_float_sse2): + push ebp + mov ebp,esp + push ebx +; push ecx ; need not be preserved +; push edx ; need not be preserved + push esi + push edi + + pcmpeqw xmm7,xmm7 + psllw xmm7,7 + packsswb xmm7,xmm7 ; xmm7 = PB_CENTERJSAMPLE (0x808080..) + + mov esi, JSAMPARRAY [sample_data] ; (JSAMPROW *) + mov eax, JDIMENSION [start_col] + mov edi, POINTER [workspace] ; (DCTELEM *) + mov ecx, DCTSIZE/2 + alignx 16,7 +.convloop: + mov ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW] ; (JSAMPLE *) + mov edx, JSAMPROW [esi+1*SIZEOF_JSAMPROW] ; (JSAMPLE *) + + movq xmm0, XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE] + movq xmm1, XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE] + + psubb xmm0,xmm7 ; xmm0=(01234567) + psubb xmm1,xmm7 ; xmm1=(89ABCDEF) + + punpcklbw xmm0,xmm0 ; xmm0=(*0*1*2*3*4*5*6*7) + punpcklbw xmm1,xmm1 ; xmm1=(*8*9*A*B*C*D*E*F) + + punpcklwd xmm2,xmm0 ; xmm2=(***0***1***2***3) + punpckhwd xmm0,xmm0 ; xmm0=(***4***5***6***7) + punpcklwd xmm3,xmm1 ; xmm3=(***8***9***A***B) + punpckhwd xmm1,xmm1 ; xmm1=(***C***D***E***F) + + psrad xmm2,(DWORD_BIT-BYTE_BIT) ; xmm2=(0123) + psrad xmm0,(DWORD_BIT-BYTE_BIT) ; xmm0=(4567) + cvtdq2ps xmm2,xmm2 ; xmm2=(0123) + cvtdq2ps xmm0,xmm0 ; xmm0=(4567) + psrad xmm3,(DWORD_BIT-BYTE_BIT) ; xmm3=(89AB) + psrad xmm1,(DWORD_BIT-BYTE_BIT) ; xmm1=(CDEF) + cvtdq2ps xmm3,xmm3 ; xmm3=(89AB) + cvtdq2ps xmm1,xmm1 ; xmm1=(CDEF) + + movaps XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], xmm2 + movaps XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm0 + movaps XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], xmm3 + movaps XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm1 + + add esi, byte 2*SIZEOF_JSAMPROW + add edi, byte 2*DCTSIZE*SIZEOF_FAST_FLOAT + dec ecx + jnz short .convloop + + pop edi + pop esi +; pop edx ; need not be preserved +; pop ecx ; need not be preserved + pop ebx + pop ebp + ret + + +; -------------------------------------------------------------------------- +; +; Quantize/descale the coefficients, and store into coef_block +; +; GLOBAL(void) +; jsimd_quantize_float_sse2 (JCOEFPTR coef_block, FAST_FLOAT *divisors, +; FAST_FLOAT *workspace); +; + +%define coef_block ebp+8 ; JCOEFPTR coef_block +%define divisors ebp+12 ; FAST_FLOAT *divisors +%define workspace ebp+16 ; FAST_FLOAT *workspace + + align 16 + global EXTN(jsimd_quantize_float_sse2) + +EXTN(jsimd_quantize_float_sse2): + push ebp + mov ebp,esp +; push ebx ; unused +; push ecx ; unused +; push edx ; need not be preserved + push esi + push edi + + mov esi, POINTER [workspace] + mov edx, POINTER [divisors] + mov edi, JCOEFPTR [coef_block] + mov eax, DCTSIZE2/16 + alignx 16,7 +.quantloop: + movaps xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_FAST_FLOAT)] + movaps xmm1, XMMWORD [XMMBLOCK(0,1,esi,SIZEOF_FAST_FLOAT)] + mulps xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)] + mulps xmm1, XMMWORD [XMMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)] + movaps xmm2, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_FAST_FLOAT)] + movaps xmm3, XMMWORD [XMMBLOCK(1,1,esi,SIZEOF_FAST_FLOAT)] + mulps xmm2, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)] + mulps xmm3, XMMWORD [XMMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)] + + cvtps2dq xmm0,xmm0 + cvtps2dq xmm1,xmm1 + cvtps2dq xmm2,xmm2 + cvtps2dq xmm3,xmm3 + + packssdw xmm0,xmm1 + packssdw xmm2,xmm3 + + movdqa XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_JCOEF)], xmm0 + movdqa XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_JCOEF)], xmm2 + + add esi, byte 16*SIZEOF_FAST_FLOAT + add edx, byte 16*SIZEOF_FAST_FLOAT + add edi, byte 16*SIZEOF_JCOEF + dec eax + jnz short .quantloop + + pop edi + pop esi +; pop edx ; need not be preserved +; pop ecx ; unused +; pop ebx ; unused + pop ebp + ret + +; For some reason, the OS X linker does not honor the request to align the +; segment unless we do this. + align 16 diff --git a/media/libjpeg/simd/jquanti-altivec.c b/media/libjpeg/simd/jquanti-altivec.c new file mode 100644 index 000000000..25cc296f7 --- /dev/null +++ b/media/libjpeg/simd/jquanti-altivec.c @@ -0,0 +1,252 @@ +/* + * AltiVec optimizations for libjpeg-turbo + * + * Copyright (C) 2014-2015, D. R. Commander. All Rights Reserved. + * + * This software is provided 'as-is', without any express or implied + * warranty. In no event will the authors be held liable for any damages + * arising from the use of this software. + * + * Permission is granted to anyone to use this software for any purpose, + * including commercial applications, and to alter it and redistribute it + * freely, subject to the following restrictions: + * + * 1. The origin of this software must not be misrepresented; you must not + * claim that you wrote the original software. If you use this software + * in a product, an acknowledgment in the product documentation would be + * appreciated but is not required. + * 2. Altered source versions must be plainly marked as such, and must not be + * misrepresented as being the original software. + * 3. This notice may not be removed or altered from any source distribution. + */ + +/* INTEGER QUANTIZATION AND SAMPLE CONVERSION */ + +#include "jsimd_altivec.h" + + +/* NOTE: The address will either be aligned or offset by 8 bytes, so we can + * always get the data we want by using a single vector load (although we may + * have to permute the result.) + */ +#if __BIG_ENDIAN__ + +#define LOAD_ROW(row) { \ + elemptr = sample_data[row] + start_col; \ + in##row = vec_ld(0, elemptr); \ + if ((size_t)elemptr & 15) \ + in##row = vec_perm(in##row, in##row, vec_lvsl(0, elemptr)); \ +} + +#else + +#define LOAD_ROW(row) { \ + elemptr = sample_data[row] + start_col; \ + in##row = vec_vsx_ld(0, elemptr); \ +} + +#endif + + +void +jsimd_convsamp_altivec (JSAMPARRAY sample_data, JDIMENSION start_col, + DCTELEM *workspace) +{ + JSAMPROW elemptr; + + __vector unsigned char in0, in1, in2, in3, in4, in5, in6, in7; + __vector short out0, out1, out2, out3, out4, out5, out6, out7; + + /* Constants */ + __vector short pw_centerjsamp = { __8X(CENTERJSAMPLE) }; + __vector unsigned char pb_zero = { __16X(0) }; + + LOAD_ROW(0); + LOAD_ROW(1); + LOAD_ROW(2); + LOAD_ROW(3); + LOAD_ROW(4); + LOAD_ROW(5); + LOAD_ROW(6); + LOAD_ROW(7); + + out0 = (__vector short)VEC_UNPACKHU(in0); + out1 = (__vector short)VEC_UNPACKHU(in1); + out2 = (__vector short)VEC_UNPACKHU(in2); + out3 = (__vector short)VEC_UNPACKHU(in3); + out4 = (__vector short)VEC_UNPACKHU(in4); + out5 = (__vector short)VEC_UNPACKHU(in5); + out6 = (__vector short)VEC_UNPACKHU(in6); + out7 = (__vector short)VEC_UNPACKHU(in7); + + out0 = vec_sub(out0, pw_centerjsamp); + out1 = vec_sub(out1, pw_centerjsamp); + out2 = vec_sub(out2, pw_centerjsamp); + out3 = vec_sub(out3, pw_centerjsamp); + out4 = vec_sub(out4, pw_centerjsamp); + out5 = vec_sub(out5, pw_centerjsamp); + out6 = vec_sub(out6, pw_centerjsamp); + out7 = vec_sub(out7, pw_centerjsamp); + + vec_st(out0, 0, workspace); + vec_st(out1, 16, workspace); + vec_st(out2, 32, workspace); + vec_st(out3, 48, workspace); + vec_st(out4, 64, workspace); + vec_st(out5, 80, workspace); + vec_st(out6, 96, workspace); + vec_st(out7, 112, workspace); +} + + +#define WORD_BIT 16 + +/* There is no AltiVec 16-bit unsigned multiply instruction, hence this. + We basically need an unsigned equivalent of vec_madds(). */ + +#define MULTIPLY(vs0, vs1, out) { \ + tmpe = vec_mule((__vector unsigned short)vs0, \ + (__vector unsigned short)vs1); \ + tmpo = vec_mulo((__vector unsigned short)vs0, \ + (__vector unsigned short)vs1); \ + out = (__vector short)vec_perm((__vector unsigned short)tmpe, \ + (__vector unsigned short)tmpo, \ + shift_pack_index); \ +} + +void +jsimd_quantize_altivec (JCOEFPTR coef_block, DCTELEM *divisors, + DCTELEM *workspace) +{ + __vector short row0, row1, row2, row3, row4, row5, row6, row7, + row0s, row1s, row2s, row3s, row4s, row5s, row6s, row7s, + corr0, corr1, corr2, corr3, corr4, corr5, corr6, corr7, + recip0, recip1, recip2, recip3, recip4, recip5, recip6, recip7, + scale0, scale1, scale2, scale3, scale4, scale5, scale6, scale7; + __vector unsigned int tmpe, tmpo; + + /* Constants */ + __vector unsigned short pw_word_bit_m1 = { __8X(WORD_BIT - 1) }; +#if __BIG_ENDIAN__ + __vector unsigned char shift_pack_index = + {0,1,16,17,4,5,20,21,8,9,24,25,12,13,28,29}; +#else + __vector unsigned char shift_pack_index = + {2,3,18,19,6,7,22,23,10,11,26,27,14,15,30,31}; +#endif + + row0 = vec_ld(0, workspace); + row1 = vec_ld(16, workspace); + row2 = vec_ld(32, workspace); + row3 = vec_ld(48, workspace); + row4 = vec_ld(64, workspace); + row5 = vec_ld(80, workspace); + row6 = vec_ld(96, workspace); + row7 = vec_ld(112, workspace); + + /* Branch-less absolute value */ + row0s = vec_sra(row0, pw_word_bit_m1); + row1s = vec_sra(row1, pw_word_bit_m1); + row2s = vec_sra(row2, pw_word_bit_m1); + row3s = vec_sra(row3, pw_word_bit_m1); + row4s = vec_sra(row4, pw_word_bit_m1); + row5s = vec_sra(row5, pw_word_bit_m1); + row6s = vec_sra(row6, pw_word_bit_m1); + row7s = vec_sra(row7, pw_word_bit_m1); + row0 = vec_xor(row0, row0s); + row1 = vec_xor(row1, row1s); + row2 = vec_xor(row2, row2s); + row3 = vec_xor(row3, row3s); + row4 = vec_xor(row4, row4s); + row5 = vec_xor(row5, row5s); + row6 = vec_xor(row6, row6s); + row7 = vec_xor(row7, row7s); + row0 = vec_sub(row0, row0s); + row1 = vec_sub(row1, row1s); + row2 = vec_sub(row2, row2s); + row3 = vec_sub(row3, row3s); + row4 = vec_sub(row4, row4s); + row5 = vec_sub(row5, row5s); + row6 = vec_sub(row6, row6s); + row7 = vec_sub(row7, row7s); + + corr0 = vec_ld(DCTSIZE2 * 2, divisors); + corr1 = vec_ld(DCTSIZE2 * 2 + 16, divisors); + corr2 = vec_ld(DCTSIZE2 * 2 + 32, divisors); + corr3 = vec_ld(DCTSIZE2 * 2 + 48, divisors); + corr4 = vec_ld(DCTSIZE2 * 2 + 64, divisors); + corr5 = vec_ld(DCTSIZE2 * 2 + 80, divisors); + corr6 = vec_ld(DCTSIZE2 * 2 + 96, divisors); + corr7 = vec_ld(DCTSIZE2 * 2 + 112, divisors); + + row0 = vec_add(row0, corr0); + row1 = vec_add(row1, corr1); + row2 = vec_add(row2, corr2); + row3 = vec_add(row3, corr3); + row4 = vec_add(row4, corr4); + row5 = vec_add(row5, corr5); + row6 = vec_add(row6, corr6); + row7 = vec_add(row7, corr7); + + recip0 = vec_ld(0, divisors); + recip1 = vec_ld(16, divisors); + recip2 = vec_ld(32, divisors); + recip3 = vec_ld(48, divisors); + recip4 = vec_ld(64, divisors); + recip5 = vec_ld(80, divisors); + recip6 = vec_ld(96, divisors); + recip7 = vec_ld(112, divisors); + + MULTIPLY(row0, recip0, row0); + MULTIPLY(row1, recip1, row1); + MULTIPLY(row2, recip2, row2); + MULTIPLY(row3, recip3, row3); + MULTIPLY(row4, recip4, row4); + MULTIPLY(row5, recip5, row5); + MULTIPLY(row6, recip6, row6); + MULTIPLY(row7, recip7, row7); + + scale0 = vec_ld(DCTSIZE2 * 4, divisors); + scale1 = vec_ld(DCTSIZE2 * 4 + 16, divisors); + scale2 = vec_ld(DCTSIZE2 * 4 + 32, divisors); + scale3 = vec_ld(DCTSIZE2 * 4 + 48, divisors); + scale4 = vec_ld(DCTSIZE2 * 4 + 64, divisors); + scale5 = vec_ld(DCTSIZE2 * 4 + 80, divisors); + scale6 = vec_ld(DCTSIZE2 * 4 + 96, divisors); + scale7 = vec_ld(DCTSIZE2 * 4 + 112, divisors); + + MULTIPLY(row0, scale0, row0); + MULTIPLY(row1, scale1, row1); + MULTIPLY(row2, scale2, row2); + MULTIPLY(row3, scale3, row3); + MULTIPLY(row4, scale4, row4); + MULTIPLY(row5, scale5, row5); + MULTIPLY(row6, scale6, row6); + MULTIPLY(row7, scale7, row7); + + row0 = vec_xor(row0, row0s); + row1 = vec_xor(row1, row1s); + row2 = vec_xor(row2, row2s); + row3 = vec_xor(row3, row3s); + row4 = vec_xor(row4, row4s); + row5 = vec_xor(row5, row5s); + row6 = vec_xor(row6, row6s); + row7 = vec_xor(row7, row7s); + row0 = vec_sub(row0, row0s); + row1 = vec_sub(row1, row1s); + row2 = vec_sub(row2, row2s); + row3 = vec_sub(row3, row3s); + row4 = vec_sub(row4, row4s); + row5 = vec_sub(row5, row5s); + row6 = vec_sub(row6, row6s); + row7 = vec_sub(row7, row7s); + + vec_st(row0, 0, coef_block); + vec_st(row1, 16, coef_block); + vec_st(row2, 32, coef_block); + vec_st(row3, 48, coef_block); + vec_st(row4, 64, coef_block); + vec_st(row5, 80, coef_block); + vec_st(row6, 96, coef_block); + vec_st(row7, 112, coef_block); +} diff --git a/media/libjpeg/simd/jquanti-sse2-64.asm b/media/libjpeg/simd/jquanti-sse2-64.asm new file mode 100644 index 000000000..66c4e5190 --- /dev/null +++ b/media/libjpeg/simd/jquanti-sse2-64.asm @@ -0,0 +1,186 @@ +; +; jquanti.asm - sample data conversion and quantization (64-bit SSE2) +; +; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB +; Copyright (C) 2009, D. R. Commander. +; +; Based on the x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler). +; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 +; +; [TAB8] + +%include "jsimdext.inc" +%include "jdct.inc" + +; -------------------------------------------------------------------------- + SECTION SEG_TEXT + BITS 64 +; +; Load data into workspace, applying unsigned->signed conversion +; +; GLOBAL(void) +; jsimd_convsamp_sse2 (JSAMPARRAY sample_data, JDIMENSION start_col, +; DCTELEM *workspace); +; + +; r10 = JSAMPARRAY sample_data +; r11 = JDIMENSION start_col +; r12 = DCTELEM *workspace + + align 16 + global EXTN(jsimd_convsamp_sse2) + +EXTN(jsimd_convsamp_sse2): + push rbp + mov rax,rsp + mov rbp,rsp + collect_args + push rbx + + pxor xmm6,xmm6 ; xmm6=(all 0's) + pcmpeqw xmm7,xmm7 + psllw xmm7,7 ; xmm7={0xFF80 0xFF80 0xFF80 0xFF80 ..} + + mov rsi, r10 + mov eax, r11d + mov rdi, r12 + mov rcx, DCTSIZE/4 +.convloop: + mov rbx, JSAMPROW [rsi+0*SIZEOF_JSAMPROW] ; (JSAMPLE *) + mov rdx, JSAMPROW [rsi+1*SIZEOF_JSAMPROW] ; (JSAMPLE *) + + movq xmm0, XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE] ; xmm0=(01234567) + movq xmm1, XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE] ; xmm1=(89ABCDEF) + + mov rbx, JSAMPROW [rsi+2*SIZEOF_JSAMPROW] ; (JSAMPLE *) + mov rdx, JSAMPROW [rsi+3*SIZEOF_JSAMPROW] ; (JSAMPLE *) + + movq xmm2, XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE] ; xmm2=(GHIJKLMN) + movq xmm3, XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE] ; xmm3=(OPQRSTUV) + + punpcklbw xmm0,xmm6 ; xmm0=(01234567) + punpcklbw xmm1,xmm6 ; xmm1=(89ABCDEF) + paddw xmm0,xmm7 + paddw xmm1,xmm7 + punpcklbw xmm2,xmm6 ; xmm2=(GHIJKLMN) + punpcklbw xmm3,xmm6 ; xmm3=(OPQRSTUV) + paddw xmm2,xmm7 + paddw xmm3,xmm7 + + movdqa XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_DCTELEM)], xmm0 + movdqa XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_DCTELEM)], xmm1 + movdqa XMMWORD [XMMBLOCK(2,0,rdi,SIZEOF_DCTELEM)], xmm2 + movdqa XMMWORD [XMMBLOCK(3,0,rdi,SIZEOF_DCTELEM)], xmm3 + + add rsi, byte 4*SIZEOF_JSAMPROW + add rdi, byte 4*DCTSIZE*SIZEOF_DCTELEM + dec rcx + jnz short .convloop + + pop rbx + uncollect_args + pop rbp + ret + +; -------------------------------------------------------------------------- +; +; Quantize/descale the coefficients, and store into coef_block +; +; This implementation is based on an algorithm described in +; "How to optimize for the Pentium family of microprocessors" +; (http://www.agner.org/assem/). +; +; GLOBAL(void) +; jsimd_quantize_sse2 (JCOEFPTR coef_block, DCTELEM *divisors, +; DCTELEM *workspace); +; + +%define RECIPROCAL(m,n,b) XMMBLOCK(DCTSIZE*0+(m),(n),(b),SIZEOF_DCTELEM) +%define CORRECTION(m,n,b) XMMBLOCK(DCTSIZE*1+(m),(n),(b),SIZEOF_DCTELEM) +%define SCALE(m,n,b) XMMBLOCK(DCTSIZE*2+(m),(n),(b),SIZEOF_DCTELEM) + +; r10 = JCOEFPTR coef_block +; r11 = DCTELEM *divisors +; r12 = DCTELEM *workspace + + align 16 + global EXTN(jsimd_quantize_sse2) + +EXTN(jsimd_quantize_sse2): + push rbp + mov rax,rsp + mov rbp,rsp + collect_args + + mov rsi, r12 + mov rdx, r11 + mov rdi, r10 + mov rax, DCTSIZE2/32 +.quantloop: + movdqa xmm4, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_DCTELEM)] + movdqa xmm5, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_DCTELEM)] + movdqa xmm6, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_DCTELEM)] + movdqa xmm7, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_DCTELEM)] + movdqa xmm0,xmm4 + movdqa xmm1,xmm5 + movdqa xmm2,xmm6 + movdqa xmm3,xmm7 + psraw xmm4,(WORD_BIT-1) + psraw xmm5,(WORD_BIT-1) + psraw xmm6,(WORD_BIT-1) + psraw xmm7,(WORD_BIT-1) + pxor xmm0,xmm4 + pxor xmm1,xmm5 + pxor xmm2,xmm6 + pxor xmm3,xmm7 + psubw xmm0,xmm4 ; if (xmm0 < 0) xmm0 = -xmm0; + psubw xmm1,xmm5 ; if (xmm1 < 0) xmm1 = -xmm1; + psubw xmm2,xmm6 ; if (xmm2 < 0) xmm2 = -xmm2; + psubw xmm3,xmm7 ; if (xmm3 < 0) xmm3 = -xmm3; + + paddw xmm0, XMMWORD [CORRECTION(0,0,rdx)] ; correction + roundfactor + paddw xmm1, XMMWORD [CORRECTION(1,0,rdx)] + paddw xmm2, XMMWORD [CORRECTION(2,0,rdx)] + paddw xmm3, XMMWORD [CORRECTION(3,0,rdx)] + pmulhuw xmm0, XMMWORD [RECIPROCAL(0,0,rdx)] ; reciprocal + pmulhuw xmm1, XMMWORD [RECIPROCAL(1,0,rdx)] + pmulhuw xmm2, XMMWORD [RECIPROCAL(2,0,rdx)] + pmulhuw xmm3, XMMWORD [RECIPROCAL(3,0,rdx)] + pmulhuw xmm0, XMMWORD [SCALE(0,0,rdx)] ; scale + pmulhuw xmm1, XMMWORD [SCALE(1,0,rdx)] + pmulhuw xmm2, XMMWORD [SCALE(2,0,rdx)] + pmulhuw xmm3, XMMWORD [SCALE(3,0,rdx)] + + pxor xmm0,xmm4 + pxor xmm1,xmm5 + pxor xmm2,xmm6 + pxor xmm3,xmm7 + psubw xmm0,xmm4 + psubw xmm1,xmm5 + psubw xmm2,xmm6 + psubw xmm3,xmm7 + movdqa XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_DCTELEM)], xmm0 + movdqa XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_DCTELEM)], xmm1 + movdqa XMMWORD [XMMBLOCK(2,0,rdi,SIZEOF_DCTELEM)], xmm2 + movdqa XMMWORD [XMMBLOCK(3,0,rdi,SIZEOF_DCTELEM)], xmm3 + + add rsi, byte 32*SIZEOF_DCTELEM + add rdx, byte 32*SIZEOF_DCTELEM + add rdi, byte 32*SIZEOF_JCOEF + dec rax + jnz near .quantloop + + uncollect_args + pop rbp + ret + +; For some reason, the OS X linker does not honor the request to align the +; segment unless we do this. + align 16 diff --git a/media/libjpeg/simd/jquanti-sse2.asm b/media/libjpeg/simd/jquanti-sse2.asm new file mode 100644 index 000000000..aea8604e2 --- /dev/null +++ b/media/libjpeg/simd/jquanti-sse2.asm @@ -0,0 +1,199 @@ +; +; jquanti.asm - sample data conversion and quantization (SSE2) +; +; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB +; +; Based on the x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler). +; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 +; +; [TAB8] + +%include "jsimdext.inc" +%include "jdct.inc" + +; -------------------------------------------------------------------------- + SECTION SEG_TEXT + BITS 32 +; +; Load data into workspace, applying unsigned->signed conversion +; +; GLOBAL(void) +; jsimd_convsamp_sse2 (JSAMPARRAY sample_data, JDIMENSION start_col, +; DCTELEM *workspace); +; + +%define sample_data ebp+8 ; JSAMPARRAY sample_data +%define start_col ebp+12 ; JDIMENSION start_col +%define workspace ebp+16 ; DCTELEM *workspace + + align 16 + global EXTN(jsimd_convsamp_sse2) + +EXTN(jsimd_convsamp_sse2): + push ebp + mov ebp,esp + push ebx +; push ecx ; need not be preserved +; push edx ; need not be preserved + push esi + push edi + + pxor xmm6,xmm6 ; xmm6=(all 0's) + pcmpeqw xmm7,xmm7 + psllw xmm7,7 ; xmm7={0xFF80 0xFF80 0xFF80 0xFF80 ..} + + mov esi, JSAMPARRAY [sample_data] ; (JSAMPROW *) + mov eax, JDIMENSION [start_col] + mov edi, POINTER [workspace] ; (DCTELEM *) + mov ecx, DCTSIZE/4 + alignx 16,7 +.convloop: + mov ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW] ; (JSAMPLE *) + mov edx, JSAMPROW [esi+1*SIZEOF_JSAMPROW] ; (JSAMPLE *) + + movq xmm0, XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE] ; xmm0=(01234567) + movq xmm1, XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE] ; xmm1=(89ABCDEF) + + mov ebx, JSAMPROW [esi+2*SIZEOF_JSAMPROW] ; (JSAMPLE *) + mov edx, JSAMPROW [esi+3*SIZEOF_JSAMPROW] ; (JSAMPLE *) + + movq xmm2, XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE] ; xmm2=(GHIJKLMN) + movq xmm3, XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE] ; xmm3=(OPQRSTUV) + + punpcklbw xmm0,xmm6 ; xmm0=(01234567) + punpcklbw xmm1,xmm6 ; xmm1=(89ABCDEF) + paddw xmm0,xmm7 + paddw xmm1,xmm7 + punpcklbw xmm2,xmm6 ; xmm2=(GHIJKLMN) + punpcklbw xmm3,xmm6 ; xmm3=(OPQRSTUV) + paddw xmm2,xmm7 + paddw xmm3,xmm7 + + movdqa XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_DCTELEM)], xmm0 + movdqa XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_DCTELEM)], xmm1 + movdqa XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_DCTELEM)], xmm2 + movdqa XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_DCTELEM)], xmm3 + + add esi, byte 4*SIZEOF_JSAMPROW + add edi, byte 4*DCTSIZE*SIZEOF_DCTELEM + dec ecx + jnz short .convloop + + pop edi + pop esi +; pop edx ; need not be preserved +; pop ecx ; need not be preserved + pop ebx + pop ebp + ret + +; -------------------------------------------------------------------------- +; +; Quantize/descale the coefficients, and store into coef_block +; +; This implementation is based on an algorithm described in +; "How to optimize for the Pentium family of microprocessors" +; (http://www.agner.org/assem/). +; +; GLOBAL(void) +; jsimd_quantize_sse2 (JCOEFPTR coef_block, DCTELEM *divisors, +; DCTELEM *workspace); +; + +%define RECIPROCAL(m,n,b) XMMBLOCK(DCTSIZE*0+(m),(n),(b),SIZEOF_DCTELEM) +%define CORRECTION(m,n,b) XMMBLOCK(DCTSIZE*1+(m),(n),(b),SIZEOF_DCTELEM) +%define SCALE(m,n,b) XMMBLOCK(DCTSIZE*2+(m),(n),(b),SIZEOF_DCTELEM) + +%define coef_block ebp+8 ; JCOEFPTR coef_block +%define divisors ebp+12 ; DCTELEM *divisors +%define workspace ebp+16 ; DCTELEM *workspace + + align 16 + global EXTN(jsimd_quantize_sse2) + +EXTN(jsimd_quantize_sse2): + push ebp + mov ebp,esp +; push ebx ; unused +; push ecx ; unused +; push edx ; need not be preserved + push esi + push edi + + mov esi, POINTER [workspace] + mov edx, POINTER [divisors] + mov edi, JCOEFPTR [coef_block] + mov eax, DCTSIZE2/32 + alignx 16,7 +.quantloop: + movdqa xmm4, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_DCTELEM)] + movdqa xmm5, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_DCTELEM)] + movdqa xmm6, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_DCTELEM)] + movdqa xmm7, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_DCTELEM)] + movdqa xmm0,xmm4 + movdqa xmm1,xmm5 + movdqa xmm2,xmm6 + movdqa xmm3,xmm7 + psraw xmm4,(WORD_BIT-1) + psraw xmm5,(WORD_BIT-1) + psraw xmm6,(WORD_BIT-1) + psraw xmm7,(WORD_BIT-1) + pxor xmm0,xmm4 + pxor xmm1,xmm5 + pxor xmm2,xmm6 + pxor xmm3,xmm7 + psubw xmm0,xmm4 ; if (xmm0 < 0) xmm0 = -xmm0; + psubw xmm1,xmm5 ; if (xmm1 < 0) xmm1 = -xmm1; + psubw xmm2,xmm6 ; if (xmm2 < 0) xmm2 = -xmm2; + psubw xmm3,xmm7 ; if (xmm3 < 0) xmm3 = -xmm3; + + paddw xmm0, XMMWORD [CORRECTION(0,0,edx)] ; correction + roundfactor + paddw xmm1, XMMWORD [CORRECTION(1,0,edx)] + paddw xmm2, XMMWORD [CORRECTION(2,0,edx)] + paddw xmm3, XMMWORD [CORRECTION(3,0,edx)] + pmulhuw xmm0, XMMWORD [RECIPROCAL(0,0,edx)] ; reciprocal + pmulhuw xmm1, XMMWORD [RECIPROCAL(1,0,edx)] + pmulhuw xmm2, XMMWORD [RECIPROCAL(2,0,edx)] + pmulhuw xmm3, XMMWORD [RECIPROCAL(3,0,edx)] + pmulhuw xmm0, XMMWORD [SCALE(0,0,edx)] ; scale + pmulhuw xmm1, XMMWORD [SCALE(1,0,edx)] + pmulhuw xmm2, XMMWORD [SCALE(2,0,edx)] + pmulhuw xmm3, XMMWORD [SCALE(3,0,edx)] + + pxor xmm0,xmm4 + pxor xmm1,xmm5 + pxor xmm2,xmm6 + pxor xmm3,xmm7 + psubw xmm0,xmm4 + psubw xmm1,xmm5 + psubw xmm2,xmm6 + psubw xmm3,xmm7 + movdqa XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_DCTELEM)], xmm0 + movdqa XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_DCTELEM)], xmm1 + movdqa XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_DCTELEM)], xmm2 + movdqa XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_DCTELEM)], xmm3 + + add esi, byte 32*SIZEOF_DCTELEM + add edx, byte 32*SIZEOF_DCTELEM + add edi, byte 32*SIZEOF_JCOEF + dec eax + jnz near .quantloop + + pop edi + pop esi +; pop edx ; need not be preserved +; pop ecx ; unused +; pop ebx ; unused + pop ebp + ret + +; For some reason, the OS X linker does not honor the request to align the +; segment unless we do this. + align 16 diff --git a/media/libjpeg/simd/jsimd.h b/media/libjpeg/simd/jsimd.h new file mode 100644 index 000000000..dc6ec430d --- /dev/null +++ b/media/libjpeg/simd/jsimd.h @@ -0,0 +1,871 @@ +/* + * simd/jsimd.h + * + * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB + * Copyright (C) 2011, 2014-2016, D. R. Commander. + * Copyright (C) 2013-2014, MIPS Technologies, Inc., California. + * Copyright (C) 2014, Linaro Limited. + * Copyright (C) 2015-2016, Matthieu Darbois. + * + * Based on the x86 SIMD extension for IJG JPEG library, + * Copyright (C) 1999-2006, MIYASAKA Masaru. + * For conditions of distribution and use, see copyright notice in jsimdext.inc + * + */ + +/* Bitmask for supported acceleration methods */ + +#define JSIMD_NONE 0x00 +#define JSIMD_MMX 0x01 +#define JSIMD_3DNOW 0x02 +#define JSIMD_SSE 0x04 +#define JSIMD_SSE2 0x08 +#define JSIMD_ARM_NEON 0x10 +#define JSIMD_MIPS_DSPR2 0x20 +#define JSIMD_ALTIVEC 0x40 + +/* SIMD Ext: retrieve SIMD/CPU information */ +EXTERN(unsigned int) jpeg_simd_cpu_support (void); + +/* RGB & extended RGB --> YCC Colorspace Conversion */ +EXTERN(void) jsimd_rgb_ycc_convert_mmx + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); +EXTERN(void) jsimd_extrgb_ycc_convert_mmx + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); +EXTERN(void) jsimd_extrgbx_ycc_convert_mmx + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); +EXTERN(void) jsimd_extbgr_ycc_convert_mmx + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); +EXTERN(void) jsimd_extbgrx_ycc_convert_mmx + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); +EXTERN(void) jsimd_extxbgr_ycc_convert_mmx + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); +EXTERN(void) jsimd_extxrgb_ycc_convert_mmx + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); + +extern const int jconst_rgb_ycc_convert_sse2[]; +EXTERN(void) jsimd_rgb_ycc_convert_sse2 + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); +EXTERN(void) jsimd_extrgb_ycc_convert_sse2 + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); +EXTERN(void) jsimd_extrgbx_ycc_convert_sse2 + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); +EXTERN(void) jsimd_extbgr_ycc_convert_sse2 + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); +EXTERN(void) jsimd_extbgrx_ycc_convert_sse2 + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); +EXTERN(void) jsimd_extxbgr_ycc_convert_sse2 + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); +EXTERN(void) jsimd_extxrgb_ycc_convert_sse2 + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); + +EXTERN(void) jsimd_rgb_ycc_convert_neon + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); +EXTERN(void) jsimd_extrgb_ycc_convert_neon + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); +EXTERN(void) jsimd_extrgbx_ycc_convert_neon + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); +EXTERN(void) jsimd_extbgr_ycc_convert_neon + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); +EXTERN(void) jsimd_extbgrx_ycc_convert_neon + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); +EXTERN(void) jsimd_extxbgr_ycc_convert_neon + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); +EXTERN(void) jsimd_extxrgb_ycc_convert_neon + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); + +EXTERN(void) jsimd_extrgb_ycc_convert_neon_slowld3 + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); +EXTERN(void) jsimd_extbgr_ycc_convert_neon_slowld3 + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); + +EXTERN(void) jsimd_rgb_ycc_convert_mips_dspr2 + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); +EXTERN(void) jsimd_extrgb_ycc_convert_mips_dspr2 + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); +EXTERN(void) jsimd_extrgbx_ycc_convert_mips_dspr2 + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); +EXTERN(void) jsimd_extbgr_ycc_convert_mips_dspr2 + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); +EXTERN(void) jsimd_extbgrx_ycc_convert_mips_dspr2 + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); +EXTERN(void) jsimd_extxbgr_ycc_convert_mips_dspr2 + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); +EXTERN(void) jsimd_extxrgb_ycc_convert_mips_dspr2 + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); + +EXTERN(void) jsimd_rgb_ycc_convert_altivec + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); +EXTERN(void) jsimd_extrgb_ycc_convert_altivec + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); +EXTERN(void) jsimd_extrgbx_ycc_convert_altivec + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); +EXTERN(void) jsimd_extbgr_ycc_convert_altivec + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); +EXTERN(void) jsimd_extbgrx_ycc_convert_altivec + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); +EXTERN(void) jsimd_extxbgr_ycc_convert_altivec + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); +EXTERN(void) jsimd_extxrgb_ycc_convert_altivec + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); + +/* RGB & extended RGB --> Grayscale Colorspace Conversion */ +EXTERN(void) jsimd_rgb_gray_convert_mmx + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); +EXTERN(void) jsimd_extrgb_gray_convert_mmx + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); +EXTERN(void) jsimd_extrgbx_gray_convert_mmx + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); +EXTERN(void) jsimd_extbgr_gray_convert_mmx + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); +EXTERN(void) jsimd_extbgrx_gray_convert_mmx + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); +EXTERN(void) jsimd_extxbgr_gray_convert_mmx + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); +EXTERN(void) jsimd_extxrgb_gray_convert_mmx + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); + +extern const int jconst_rgb_gray_convert_sse2[]; +EXTERN(void) jsimd_rgb_gray_convert_sse2 + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); +EXTERN(void) jsimd_extrgb_gray_convert_sse2 + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); +EXTERN(void) jsimd_extrgbx_gray_convert_sse2 + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); +EXTERN(void) jsimd_extbgr_gray_convert_sse2 + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); +EXTERN(void) jsimd_extbgrx_gray_convert_sse2 + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); +EXTERN(void) jsimd_extxbgr_gray_convert_sse2 + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); +EXTERN(void) jsimd_extxrgb_gray_convert_sse2 + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); + +EXTERN(void) jsimd_rgb_gray_convert_mips_dspr2 + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); +EXTERN(void) jsimd_extrgb_gray_convert_mips_dspr2 + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); +EXTERN(void) jsimd_extrgbx_gray_convert_mips_dspr2 + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); +EXTERN(void) jsimd_extbgr_gray_convert_mips_dspr2 + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); +EXTERN(void) jsimd_extbgrx_gray_convert_mips_dspr2 + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); +EXTERN(void) jsimd_extxbgr_gray_convert_mips_dspr2 + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); +EXTERN(void) jsimd_extxrgb_gray_convert_mips_dspr2 + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); + +EXTERN(void) jsimd_rgb_gray_convert_altivec + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); +EXTERN(void) jsimd_extrgb_gray_convert_altivec + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); +EXTERN(void) jsimd_extrgbx_gray_convert_altivec + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); +EXTERN(void) jsimd_extbgr_gray_convert_altivec + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); +EXTERN(void) jsimd_extbgrx_gray_convert_altivec + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); +EXTERN(void) jsimd_extxbgr_gray_convert_altivec + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); +EXTERN(void) jsimd_extxrgb_gray_convert_altivec + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); + +/* YCC --> RGB & extended RGB Colorspace Conversion */ +EXTERN(void) jsimd_ycc_rgb_convert_mmx + (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row, + JSAMPARRAY output_buf, int num_rows); +EXTERN(void) jsimd_ycc_extrgb_convert_mmx + (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row, + JSAMPARRAY output_buf, int num_rows); +EXTERN(void) jsimd_ycc_extrgbx_convert_mmx + (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row, + JSAMPARRAY output_buf, int num_rows); +EXTERN(void) jsimd_ycc_extbgr_convert_mmx + (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row, + JSAMPARRAY output_buf, int num_rows); +EXTERN(void) jsimd_ycc_extbgrx_convert_mmx + (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row, + JSAMPARRAY output_buf, int num_rows); +EXTERN(void) jsimd_ycc_extxbgr_convert_mmx + (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row, + JSAMPARRAY output_buf, int num_rows); +EXTERN(void) jsimd_ycc_extxrgb_convert_mmx + (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row, + JSAMPARRAY output_buf, int num_rows); + +extern const int jconst_ycc_rgb_convert_sse2[]; +EXTERN(void) jsimd_ycc_rgb_convert_sse2 + (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row, + JSAMPARRAY output_buf, int num_rows); +EXTERN(void) jsimd_ycc_extrgb_convert_sse2 + (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row, + JSAMPARRAY output_buf, int num_rows); +EXTERN(void) jsimd_ycc_extrgbx_convert_sse2 + (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row, + JSAMPARRAY output_buf, int num_rows); +EXTERN(void) jsimd_ycc_extbgr_convert_sse2 + (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row, + JSAMPARRAY output_buf, int num_rows); +EXTERN(void) jsimd_ycc_extbgrx_convert_sse2 + (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row, + JSAMPARRAY output_buf, int num_rows); +EXTERN(void) jsimd_ycc_extxbgr_convert_sse2 + (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row, + JSAMPARRAY output_buf, int num_rows); +EXTERN(void) jsimd_ycc_extxrgb_convert_sse2 + (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row, + JSAMPARRAY output_buf, int num_rows); + +EXTERN(void) jsimd_ycc_rgb_convert_neon + (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row, + JSAMPARRAY output_buf, int num_rows); +EXTERN(void) jsimd_ycc_extrgb_convert_neon + (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row, + JSAMPARRAY output_buf, int num_rows); +EXTERN(void) jsimd_ycc_extrgbx_convert_neon + (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row, + JSAMPARRAY output_buf, int num_rows); +EXTERN(void) jsimd_ycc_extbgr_convert_neon + (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row, + JSAMPARRAY output_buf, int num_rows); +EXTERN(void) jsimd_ycc_extbgrx_convert_neon + (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row, + JSAMPARRAY output_buf, int num_rows); +EXTERN(void) jsimd_ycc_extxbgr_convert_neon + (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row, + JSAMPARRAY output_buf, int num_rows); +EXTERN(void) jsimd_ycc_extxrgb_convert_neon + (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row, + JSAMPARRAY output_buf, int num_rows); +EXTERN(void) jsimd_ycc_rgb565_convert_neon + (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row, + JSAMPARRAY output_buf, int num_rows); + +EXTERN(void) jsimd_ycc_extrgb_convert_neon_slowst3 + (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row, + JSAMPARRAY output_buf, int num_rows); +EXTERN(void) jsimd_ycc_extbgr_convert_neon_slowst3 + (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row, + JSAMPARRAY output_buf, int num_rows); + +EXTERN(void) jsimd_ycc_rgb_convert_mips_dspr2 + (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row, + JSAMPARRAY output_buf, int num_rows); +EXTERN(void) jsimd_ycc_extrgb_convert_mips_dspr2 + (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row, + JSAMPARRAY output_buf, int num_rows); +EXTERN(void) jsimd_ycc_extrgbx_convert_mips_dspr2 + (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row, + JSAMPARRAY output_buf, int num_rows); +EXTERN(void) jsimd_ycc_extbgr_convert_mips_dspr2 + (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row, + JSAMPARRAY output_buf, int num_rows); +EXTERN(void) jsimd_ycc_extbgrx_convert_mips_dspr2 + (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row, + JSAMPARRAY output_buf, int num_rows); +EXTERN(void) jsimd_ycc_extxbgr_convert_mips_dspr2 + (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row, + JSAMPARRAY output_buf, int num_rows); +EXTERN(void) jsimd_ycc_extxrgb_convert_mips_dspr2 + (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row, + JSAMPARRAY output_buf, int num_rows); + +EXTERN(void) jsimd_ycc_rgb_convert_altivec + (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row, + JSAMPARRAY output_buf, int num_rows); +EXTERN(void) jsimd_ycc_extrgb_convert_altivec + (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row, + JSAMPARRAY output_buf, int num_rows); +EXTERN(void) jsimd_ycc_extrgbx_convert_altivec + (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row, + JSAMPARRAY output_buf, int num_rows); +EXTERN(void) jsimd_ycc_extbgr_convert_altivec + (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row, + JSAMPARRAY output_buf, int num_rows); +EXTERN(void) jsimd_ycc_extbgrx_convert_altivec + (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row, + JSAMPARRAY output_buf, int num_rows); +EXTERN(void) jsimd_ycc_extxbgr_convert_altivec + (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row, + JSAMPARRAY output_buf, int num_rows); +EXTERN(void) jsimd_ycc_extxrgb_convert_altivec + (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row, + JSAMPARRAY output_buf, int num_rows); + +/* NULL Colorspace Conversion */ +EXTERN(void) jsimd_c_null_convert_mips_dspr2 + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows, int num_components); + +/* h2v1 Downsampling */ +EXTERN(void) jsimd_h2v1_downsample_mmx + (JDIMENSION image_width, int max_v_samp_factor, + JDIMENSION v_samp_factor, JDIMENSION width_blocks, + JSAMPARRAY input_data, JSAMPARRAY output_data); + +EXTERN(void) jsimd_h2v1_downsample_sse2 + (JDIMENSION image_width, int max_v_samp_factor, + JDIMENSION v_samp_factor, JDIMENSION width_blocks, + JSAMPARRAY input_data, JSAMPARRAY output_data); + +EXTERN(void) jsimd_h2v1_downsample_neon + (JDIMENSION image_width, int max_v_samp_factor, + JDIMENSION v_samp_factor, JDIMENSION width_blocks, + JSAMPARRAY input_data, JSAMPARRAY output_data); + +EXTERN(void) jsimd_h2v1_downsample_mips_dspr2 + (JDIMENSION image_width, int max_v_samp_factor, + JDIMENSION v_samp_factor, JDIMENSION width_blocks, + JSAMPARRAY input_data, JSAMPARRAY output_data); + +EXTERN(void) jsimd_h2v1_downsample_altivec + (JDIMENSION image_width, int max_v_samp_factor, + JDIMENSION v_samp_factor, JDIMENSION width_blocks, + JSAMPARRAY input_data, JSAMPARRAY output_data); + +/* h2v2 Downsampling */ +EXTERN(void) jsimd_h2v2_downsample_mmx + (JDIMENSION image_width, int max_v_samp_factor, + JDIMENSION v_samp_factor, JDIMENSION width_blocks, + JSAMPARRAY input_data, JSAMPARRAY output_data); + +EXTERN(void) jsimd_h2v2_downsample_sse2 + (JDIMENSION image_width, int max_v_samp_factor, + JDIMENSION v_samp_factor, JDIMENSION width_blocks, + JSAMPARRAY input_data, JSAMPARRAY output_data); + +EXTERN(void) jsimd_h2v2_downsample_neon + (JDIMENSION image_width, int max_v_samp_factor, + JDIMENSION v_samp_factor, JDIMENSION width_blocks, + JSAMPARRAY input_data, JSAMPARRAY output_data); + +EXTERN(void) jsimd_h2v2_downsample_mips_dspr2 + (JDIMENSION image_width, int max_v_samp_factor, + JDIMENSION v_samp_factor, JDIMENSION width_blocks, + JSAMPARRAY input_data, JSAMPARRAY output_data); + +EXTERN(void) jsimd_h2v2_downsample_altivec + (JDIMENSION image_width, int max_v_samp_factor, + JDIMENSION v_samp_factor, JDIMENSION width_blocks, + JSAMPARRAY input_data, JSAMPARRAY output_data); + +/* h2v2 Smooth Downsampling */ +EXTERN(void) jsimd_h2v2_smooth_downsample_mips_dspr2 + (JSAMPARRAY input_data, JSAMPARRAY output_data, + JDIMENSION v_samp_factor, int max_v_samp_factor, + int smoothing_factor, JDIMENSION width_blocks, + JDIMENSION image_width); + + +/* Upsampling */ +EXTERN(void) jsimd_h2v1_upsample_mmx + (int max_v_samp_factor, JDIMENSION output_width, JSAMPARRAY input_data, + JSAMPARRAY *output_data_ptr); +EXTERN(void) jsimd_h2v2_upsample_mmx + (int max_v_samp_factor, JDIMENSION output_width, JSAMPARRAY input_data, + JSAMPARRAY *output_data_ptr); + +EXTERN(void) jsimd_h2v1_upsample_sse2 + (int max_v_samp_factor, JDIMENSION output_width, JSAMPARRAY input_data, + JSAMPARRAY *output_data_ptr); +EXTERN(void) jsimd_h2v2_upsample_sse2 + (int max_v_samp_factor, JDIMENSION output_width, JSAMPARRAY input_data, + JSAMPARRAY *output_data_ptr); + +EXTERN(void) jsimd_h2v1_upsample_mips_dspr2 + (int max_v_samp_factor, JDIMENSION output_width, JSAMPARRAY input_data, + JSAMPARRAY *output_data_ptr); +EXTERN(void) jsimd_h2v2_upsample_mips_dspr2 + (int max_v_samp_factor, JDIMENSION output_width, JSAMPARRAY input_data, + JSAMPARRAY *output_data_ptr); + +EXTERN(void) jsimd_int_upsample_mips_dspr2 + (UINT8 h_expand, UINT8 v_expand, JSAMPARRAY input_data, + JSAMPARRAY *output_data_ptr, JDIMENSION output_width, + int max_v_samp_factor); + +EXTERN(void) jsimd_h2v1_upsample_altivec + (int max_v_samp_factor, JDIMENSION output_width, JSAMPARRAY input_data, + JSAMPARRAY *output_data_ptr); +EXTERN(void) jsimd_h2v2_upsample_altivec + (int max_v_samp_factor, JDIMENSION output_width, JSAMPARRAY input_data, + JSAMPARRAY *output_data_ptr); + +/* Fancy Upsampling */ +EXTERN(void) jsimd_h2v1_fancy_upsample_mmx + (int max_v_samp_factor, JDIMENSION downsampled_width, + JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr); +EXTERN(void) jsimd_h2v2_fancy_upsample_mmx + (int max_v_samp_factor, JDIMENSION downsampled_width, + JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr); + +extern const int jconst_fancy_upsample_sse2[]; +EXTERN(void) jsimd_h2v1_fancy_upsample_sse2 + (int max_v_samp_factor, JDIMENSION downsampled_width, + JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr); +EXTERN(void) jsimd_h2v2_fancy_upsample_sse2 + (int max_v_samp_factor, JDIMENSION downsampled_width, + JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr); + +EXTERN(void) jsimd_h2v1_fancy_upsample_neon + (int max_v_samp_factor, JDIMENSION downsampled_width, + JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr); + +EXTERN(void) jsimd_h2v1_fancy_upsample_mips_dspr2 + (int max_v_samp_factor, JDIMENSION downsampled_width, + JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr); +EXTERN(void) jsimd_h2v2_fancy_upsample_mips_dspr2 + (int max_v_samp_factor, JDIMENSION downsampled_width, + JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr); + +EXTERN(void) jsimd_h2v1_fancy_upsample_altivec + (int max_v_samp_factor, JDIMENSION downsampled_width, + JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr); +EXTERN(void) jsimd_h2v2_fancy_upsample_altivec + (int max_v_samp_factor, JDIMENSION downsampled_width, + JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr); + +/* Merged Upsampling */ +EXTERN(void) jsimd_h2v1_merged_upsample_mmx + (JDIMENSION output_width, JSAMPIMAGE input_buf, + JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf); +EXTERN(void) jsimd_h2v1_extrgb_merged_upsample_mmx + (JDIMENSION output_width, JSAMPIMAGE input_buf, + JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf); +EXTERN(void) jsimd_h2v1_extrgbx_merged_upsample_mmx + (JDIMENSION output_width, JSAMPIMAGE input_buf, + JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf); +EXTERN(void) jsimd_h2v1_extbgr_merged_upsample_mmx + (JDIMENSION output_width, JSAMPIMAGE input_buf, + JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf); +EXTERN(void) jsimd_h2v1_extbgrx_merged_upsample_mmx + (JDIMENSION output_width, JSAMPIMAGE input_buf, + JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf); +EXTERN(void) jsimd_h2v1_extxbgr_merged_upsample_mmx + (JDIMENSION output_width, JSAMPIMAGE input_buf, + JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf); +EXTERN(void) jsimd_h2v1_extxrgb_merged_upsample_mmx + (JDIMENSION output_width, JSAMPIMAGE input_buf, + JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf); + +EXTERN(void) jsimd_h2v2_merged_upsample_mmx + (JDIMENSION output_width, JSAMPIMAGE input_buf, + JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf); +EXTERN(void) jsimd_h2v2_extrgb_merged_upsample_mmx + (JDIMENSION output_width, JSAMPIMAGE input_buf, + JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf); +EXTERN(void) jsimd_h2v2_extrgbx_merged_upsample_mmx + (JDIMENSION output_width, JSAMPIMAGE input_buf, + JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf); +EXTERN(void) jsimd_h2v2_extbgr_merged_upsample_mmx + (JDIMENSION output_width, JSAMPIMAGE input_buf, + JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf); +EXTERN(void) jsimd_h2v2_extbgrx_merged_upsample_mmx + (JDIMENSION output_width, JSAMPIMAGE input_buf, + JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf); +EXTERN(void) jsimd_h2v2_extxbgr_merged_upsample_mmx + (JDIMENSION output_width, JSAMPIMAGE input_buf, + JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf); +EXTERN(void) jsimd_h2v2_extxrgb_merged_upsample_mmx + (JDIMENSION output_width, JSAMPIMAGE input_buf, + JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf); + +extern const int jconst_merged_upsample_sse2[]; +EXTERN(void) jsimd_h2v1_merged_upsample_sse2 + (JDIMENSION output_width, JSAMPIMAGE input_buf, + JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf); +EXTERN(void) jsimd_h2v1_extrgb_merged_upsample_sse2 + (JDIMENSION output_width, JSAMPIMAGE input_buf, + JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf); +EXTERN(void) jsimd_h2v1_extrgbx_merged_upsample_sse2 + (JDIMENSION output_width, JSAMPIMAGE input_buf, + JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf); +EXTERN(void) jsimd_h2v1_extbgr_merged_upsample_sse2 + (JDIMENSION output_width, JSAMPIMAGE input_buf, + JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf); +EXTERN(void) jsimd_h2v1_extbgrx_merged_upsample_sse2 + (JDIMENSION output_width, JSAMPIMAGE input_buf, + JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf); +EXTERN(void) jsimd_h2v1_extxbgr_merged_upsample_sse2 + (JDIMENSION output_width, JSAMPIMAGE input_buf, + JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf); +EXTERN(void) jsimd_h2v1_extxrgb_merged_upsample_sse2 + (JDIMENSION output_width, JSAMPIMAGE input_buf, + JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf); + +EXTERN(void) jsimd_h2v2_merged_upsample_sse2 + (JDIMENSION output_width, JSAMPIMAGE input_buf, + JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf); +EXTERN(void) jsimd_h2v2_extrgb_merged_upsample_sse2 + (JDIMENSION output_width, JSAMPIMAGE input_buf, + JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf); +EXTERN(void) jsimd_h2v2_extrgbx_merged_upsample_sse2 + (JDIMENSION output_width, JSAMPIMAGE input_buf, + JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf); +EXTERN(void) jsimd_h2v2_extbgr_merged_upsample_sse2 + (JDIMENSION output_width, JSAMPIMAGE input_buf, + JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf); +EXTERN(void) jsimd_h2v2_extbgrx_merged_upsample_sse2 + (JDIMENSION output_width, JSAMPIMAGE input_buf, + JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf); +EXTERN(void) jsimd_h2v2_extxbgr_merged_upsample_sse2 + (JDIMENSION output_width, JSAMPIMAGE input_buf, + JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf); +EXTERN(void) jsimd_h2v2_extxrgb_merged_upsample_sse2 + (JDIMENSION output_width, JSAMPIMAGE input_buf, + JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf); + +EXTERN(void) jsimd_h2v1_merged_upsample_mips_dspr2 + (JDIMENSION output_width, JSAMPIMAGE input_buf, + JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf, JSAMPLE* range); +EXTERN(void) jsimd_h2v1_extrgb_merged_upsample_mips_dspr2 + (JDIMENSION output_width, JSAMPIMAGE input_buf, + JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf, JSAMPLE* range); +EXTERN(void) jsimd_h2v1_extrgbx_merged_upsample_mips_dspr2 + (JDIMENSION output_width, JSAMPIMAGE input_buf, + JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf, JSAMPLE* range); +EXTERN(void) jsimd_h2v1_extbgr_merged_upsample_mips_dspr2 + (JDIMENSION output_width, JSAMPIMAGE input_buf, + JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf, JSAMPLE* range); +EXTERN(void) jsimd_h2v1_extbgrx_merged_upsample_mips_dspr2 + (JDIMENSION output_width, JSAMPIMAGE input_buf, + JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf, JSAMPLE* range); +EXTERN(void) jsimd_h2v1_extxbgr_merged_upsample_mips_dspr2 + (JDIMENSION output_width, JSAMPIMAGE input_buf, + JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf, JSAMPLE* range); +EXTERN(void) jsimd_h2v1_extxrgb_merged_upsample_mips_dspr2 + (JDIMENSION output_width, JSAMPIMAGE input_buf, + JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf, JSAMPLE* range); + +EXTERN(void) jsimd_h2v2_merged_upsample_mips_dspr2 + (JDIMENSION output_width, JSAMPIMAGE input_buf, + JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf, JSAMPLE* range); +EXTERN(void) jsimd_h2v2_extrgb_merged_upsample_mips_dspr2 + (JDIMENSION output_width, JSAMPIMAGE input_buf, + JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf, JSAMPLE* range); +EXTERN(void) jsimd_h2v2_extrgbx_merged_upsample_mips_dspr2 + (JDIMENSION output_width, JSAMPIMAGE input_buf, + JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf, JSAMPLE* range); +EXTERN(void) jsimd_h2v2_extbgr_merged_upsample_mips_dspr2 + (JDIMENSION output_width, JSAMPIMAGE input_buf, + JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf, JSAMPLE* range); +EXTERN(void) jsimd_h2v2_extbgrx_merged_upsample_mips_dspr2 + (JDIMENSION output_width, JSAMPIMAGE input_buf, + JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf, JSAMPLE* range); +EXTERN(void) jsimd_h2v2_extxbgr_merged_upsample_mips_dspr2 + (JDIMENSION output_width, JSAMPIMAGE input_buf, + JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf, JSAMPLE* range); +EXTERN(void) jsimd_h2v2_extxrgb_merged_upsample_mips_dspr2 + (JDIMENSION output_width, JSAMPIMAGE input_buf, + JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf, JSAMPLE* range); + +EXTERN(void) jsimd_h2v1_merged_upsample_altivec + (JDIMENSION output_width, JSAMPIMAGE input_buf, + JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf); +EXTERN(void) jsimd_h2v1_extrgb_merged_upsample_altivec + (JDIMENSION output_width, JSAMPIMAGE input_buf, + JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf); +EXTERN(void) jsimd_h2v1_extrgbx_merged_upsample_altivec + (JDIMENSION output_width, JSAMPIMAGE input_buf, + JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf); +EXTERN(void) jsimd_h2v1_extbgr_merged_upsample_altivec + (JDIMENSION output_width, JSAMPIMAGE input_buf, + JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf); +EXTERN(void) jsimd_h2v1_extbgrx_merged_upsample_altivec + (JDIMENSION output_width, JSAMPIMAGE input_buf, + JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf); +EXTERN(void) jsimd_h2v1_extxbgr_merged_upsample_altivec + (JDIMENSION output_width, JSAMPIMAGE input_buf, + JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf); +EXTERN(void) jsimd_h2v1_extxrgb_merged_upsample_altivec + (JDIMENSION output_width, JSAMPIMAGE input_buf, + JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf); + +EXTERN(void) jsimd_h2v2_merged_upsample_altivec + (JDIMENSION output_width, JSAMPIMAGE input_buf, + JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf); +EXTERN(void) jsimd_h2v2_extrgb_merged_upsample_altivec + (JDIMENSION output_width, JSAMPIMAGE input_buf, + JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf); +EXTERN(void) jsimd_h2v2_extrgbx_merged_upsample_altivec + (JDIMENSION output_width, JSAMPIMAGE input_buf, + JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf); +EXTERN(void) jsimd_h2v2_extbgr_merged_upsample_altivec + (JDIMENSION output_width, JSAMPIMAGE input_buf, + JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf); +EXTERN(void) jsimd_h2v2_extbgrx_merged_upsample_altivec + (JDIMENSION output_width, JSAMPIMAGE input_buf, + JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf); +EXTERN(void) jsimd_h2v2_extxbgr_merged_upsample_altivec + (JDIMENSION output_width, JSAMPIMAGE input_buf, + JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf); +EXTERN(void) jsimd_h2v2_extxrgb_merged_upsample_altivec + (JDIMENSION output_width, JSAMPIMAGE input_buf, + JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf); + +/* Sample Conversion */ +EXTERN(void) jsimd_convsamp_mmx + (JSAMPARRAY sample_data, JDIMENSION start_col, DCTELEM *workspace); + +EXTERN(void) jsimd_convsamp_sse2 + (JSAMPARRAY sample_data, JDIMENSION start_col, DCTELEM *workspace); + +EXTERN(void) jsimd_convsamp_neon + (JSAMPARRAY sample_data, JDIMENSION start_col, DCTELEM *workspace); + +EXTERN(void) jsimd_convsamp_mips_dspr2 + (JSAMPARRAY sample_data, JDIMENSION start_col, DCTELEM *workspace); + +EXTERN(void) jsimd_convsamp_altivec + (JSAMPARRAY sample_data, JDIMENSION start_col, DCTELEM *workspace); + +/* Floating Point Sample Conversion */ +EXTERN(void) jsimd_convsamp_float_3dnow + (JSAMPARRAY sample_data, JDIMENSION start_col, FAST_FLOAT *workspace); + +EXTERN(void) jsimd_convsamp_float_sse + (JSAMPARRAY sample_data, JDIMENSION start_col, FAST_FLOAT *workspace); + +EXTERN(void) jsimd_convsamp_float_sse2 + (JSAMPARRAY sample_data, JDIMENSION start_col, FAST_FLOAT *workspace); + +EXTERN(void) jsimd_convsamp_float_mips_dspr2 + (JSAMPARRAY sample_data, JDIMENSION start_col, FAST_FLOAT *workspace); + +/* Slow Integer Forward DCT */ +EXTERN(void) jsimd_fdct_islow_mmx (DCTELEM *data); + +extern const int jconst_fdct_islow_sse2[]; +EXTERN(void) jsimd_fdct_islow_sse2 (DCTELEM *data); + +EXTERN(void) jsimd_fdct_islow_neon (DCTELEM *data); + +EXTERN(void) jsimd_fdct_islow_mips_dspr2 (DCTELEM *data); + +EXTERN(void) jsimd_fdct_islow_altivec (DCTELEM *data); + +/* Fast Integer Forward DCT */ +EXTERN(void) jsimd_fdct_ifast_mmx (DCTELEM *data); + +extern const int jconst_fdct_ifast_sse2[]; +EXTERN(void) jsimd_fdct_ifast_sse2 (DCTELEM *data); + +EXTERN(void) jsimd_fdct_ifast_neon (DCTELEM *data); + +EXTERN(void) jsimd_fdct_ifast_mips_dspr2 (DCTELEM *data); + +EXTERN(void) jsimd_fdct_ifast_altivec (DCTELEM *data); + +/* Floating Point Forward DCT */ +EXTERN(void) jsimd_fdct_float_3dnow (FAST_FLOAT *data); + +extern const int jconst_fdct_float_sse[]; +EXTERN(void) jsimd_fdct_float_sse (FAST_FLOAT *data); + +/* Quantization */ +EXTERN(void) jsimd_quantize_mmx + (JCOEFPTR coef_block, DCTELEM *divisors, DCTELEM *workspace); + +EXTERN(void) jsimd_quantize_sse2 + (JCOEFPTR coef_block, DCTELEM *divisors, DCTELEM *workspace); + +EXTERN(void) jsimd_quantize_neon + (JCOEFPTR coef_block, DCTELEM *divisors, DCTELEM *workspace); + +EXTERN(void) jsimd_quantize_mips_dspr2 + (JCOEFPTR coef_block, DCTELEM *divisors, DCTELEM *workspace); + +EXTERN(void) jsimd_quantize_altivec + (JCOEFPTR coef_block, DCTELEM *divisors, DCTELEM *workspace); + +/* Floating Point Quantization */ +EXTERN(void) jsimd_quantize_float_3dnow + (JCOEFPTR coef_block, FAST_FLOAT *divisors, FAST_FLOAT *workspace); + +EXTERN(void) jsimd_quantize_float_sse + (JCOEFPTR coef_block, FAST_FLOAT *divisors, FAST_FLOAT *workspace); + +EXTERN(void) jsimd_quantize_float_sse2 + (JCOEFPTR coef_block, FAST_FLOAT *divisors, FAST_FLOAT *workspace); + +EXTERN(void) jsimd_quantize_float_mips_dspr2 + (JCOEFPTR coef_block, FAST_FLOAT *divisors, FAST_FLOAT *workspace); + +/* Scaled Inverse DCT */ +EXTERN(void) jsimd_idct_2x2_mmx + (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf, + JDIMENSION output_col); +EXTERN(void) jsimd_idct_4x4_mmx + (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf, + JDIMENSION output_col); + +extern const int jconst_idct_red_sse2[]; +EXTERN(void) jsimd_idct_2x2_sse2 + (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf, + JDIMENSION output_col); +EXTERN(void) jsimd_idct_4x4_sse2 + (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf, + JDIMENSION output_col); + +EXTERN(void) jsimd_idct_2x2_neon + (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf, + JDIMENSION output_col); +EXTERN(void) jsimd_idct_4x4_neon + (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf, + JDIMENSION output_col); + +EXTERN(void) jsimd_idct_2x2_mips_dspr2 + (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf, + JDIMENSION output_col); +EXTERN(void) jsimd_idct_4x4_mips_dspr2 + (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf, + JDIMENSION output_col, int *workspace); +EXTERN(void) jsimd_idct_6x6_mips_dspr2 + (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf, + JDIMENSION output_col); +EXTERN(void) jsimd_idct_12x12_pass1_mips_dspr2 + (JCOEFPTR coef_block, void *dct_table, int *workspace); +EXTERN(void) jsimd_idct_12x12_pass2_mips_dspr2 + (int *workspace, int *output); + +/* Slow Integer Inverse DCT */ +EXTERN(void) jsimd_idct_islow_mmx + (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf, + JDIMENSION output_col); + +extern const int jconst_idct_islow_sse2[]; +EXTERN(void) jsimd_idct_islow_sse2 + (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf, + JDIMENSION output_col); + +EXTERN(void) jsimd_idct_islow_neon + (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf, + JDIMENSION output_col); + +EXTERN(void) jsimd_idct_islow_mips_dspr2 + (void *dct_table, JCOEFPTR coef_block, int *output_buf, + JSAMPLE *output_col); + +EXTERN(void) jsimd_idct_islow_altivec + (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf, + JDIMENSION output_col); + +/* Fast Integer Inverse DCT */ +EXTERN(void) jsimd_idct_ifast_mmx + (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf, + JDIMENSION output_col); + +extern const int jconst_idct_ifast_sse2[]; +EXTERN(void) jsimd_idct_ifast_sse2 + (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf, + JDIMENSION output_col); + +EXTERN(void) jsimd_idct_ifast_neon + (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf, + JDIMENSION output_col); + +EXTERN(void) jsimd_idct_ifast_cols_mips_dspr2 + (JCOEF *inptr, IFAST_MULT_TYPE *quantptr, DCTELEM *wsptr, + const int *idct_coefs); +EXTERN(void) jsimd_idct_ifast_rows_mips_dspr2 + (DCTELEM *wsptr, JSAMPARRAY output_buf, JDIMENSION output_col, + const int *idct_coefs); + +EXTERN(void) jsimd_idct_ifast_altivec + (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf, + JDIMENSION output_col); + +/* Floating Point Inverse DCT */ +EXTERN(void) jsimd_idct_float_3dnow + (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf, + JDIMENSION output_col); + +extern const int jconst_idct_float_sse[]; +EXTERN(void) jsimd_idct_float_sse + (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf, + JDIMENSION output_col); + +extern const int jconst_idct_float_sse2[]; +EXTERN(void) jsimd_idct_float_sse2 + (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf, + JDIMENSION output_col); + +/* Huffman coding */ +extern const int jconst_huff_encode_one_block[]; +EXTERN(JOCTET*) jsimd_huff_encode_one_block_sse2 + (void *state, JOCTET *buffer, JCOEFPTR block, int last_dc_val, + c_derived_tbl *dctbl, c_derived_tbl *actbl); + +EXTERN(JOCTET*) jsimd_huff_encode_one_block_neon + (void *state, JOCTET *buffer, JCOEFPTR block, int last_dc_val, + c_derived_tbl *dctbl, c_derived_tbl *actbl); + +EXTERN(JOCTET*) jsimd_huff_encode_one_block_neon_slowtbl + (void *state, JOCTET *buffer, JCOEFPTR block, int last_dc_val, + c_derived_tbl *dctbl, c_derived_tbl *actbl); diff --git a/media/libjpeg/simd/jsimd_altivec.h b/media/libjpeg/simd/jsimd_altivec.h new file mode 100644 index 000000000..62dbc5cdf --- /dev/null +++ b/media/libjpeg/simd/jsimd_altivec.h @@ -0,0 +1,99 @@ +/* + * AltiVec optimizations for libjpeg-turbo + * + * Copyright (C) 2014-2015, D. R. Commander. All Rights Reserved. + * + * This software is provided 'as-is', without any express or implied + * warranty. In no event will the authors be held liable for any damages + * arising from the use of this software. + * + * Permission is granted to anyone to use this software for any purpose, + * including commercial applications, and to alter it and redistribute it + * freely, subject to the following restrictions: + * + * 1. The origin of this software must not be misrepresented; you must not + * claim that you wrote the original software. If you use this software + * in a product, an acknowledgment in the product documentation would be + * appreciated but is not required. + * 2. Altered source versions must be plainly marked as such, and must not be + * misrepresented as being the original software. + * 3. This notice may not be removed or altered from any source distribution. + */ + +#define JPEG_INTERNALS +#include "../jinclude.h" +#include "../jpeglib.h" +#include "../jsimd.h" +#include "../jdct.h" +#include "../jsimddct.h" +#include "jsimd.h" +#include <altivec.h> + + +/* Common code */ + +#define __4X(a) a, a, a, a +#define __4X2(a, b) a, b, a, b, a, b, a, b +#define __8X(a) __4X(a), __4X(a) +#define __16X(a) __8X(a), __8X(a) + +#define TRANSPOSE(row, col) \ +{ \ + __vector short row04l, row04h, row15l, row15h, \ + row26l, row26h, row37l, row37h; \ + __vector short col01e, col01o, col23e, col23o, \ + col45e, col45o, col67e, col67o; \ + \ + /* transpose coefficients (phase 1) */ \ + row04l = vec_mergeh(row##0, row##4); /* row04l=(00 40 01 41 02 42 03 43) */ \ + row04h = vec_mergel(row##0, row##4); /* row04h=(04 44 05 45 06 46 07 47) */ \ + row15l = vec_mergeh(row##1, row##5); /* row15l=(10 50 11 51 12 52 13 53) */ \ + row15h = vec_mergel(row##1, row##5); /* row15h=(14 54 15 55 16 56 17 57) */ \ + row26l = vec_mergeh(row##2, row##6); /* row26l=(20 60 21 61 22 62 23 63) */ \ + row26h = vec_mergel(row##2, row##6); /* row26h=(24 64 25 65 26 66 27 67) */ \ + row37l = vec_mergeh(row##3, row##7); /* row37l=(30 70 31 71 32 72 33 73) */ \ + row37h = vec_mergel(row##3, row##7); /* row37h=(34 74 35 75 36 76 37 77) */ \ + \ + /* transpose coefficients (phase 2) */ \ + col01e = vec_mergeh(row04l, row26l); /* col01e=(00 20 40 60 01 21 41 61) */ \ + col23e = vec_mergel(row04l, row26l); /* col23e=(02 22 42 62 03 23 43 63) */ \ + col45e = vec_mergeh(row04h, row26h); /* col45e=(04 24 44 64 05 25 45 65) */ \ + col67e = vec_mergel(row04h, row26h); /* col67e=(06 26 46 66 07 27 47 67) */ \ + col01o = vec_mergeh(row15l, row37l); /* col01o=(10 30 50 70 11 31 51 71) */ \ + col23o = vec_mergel(row15l, row37l); /* col23o=(12 32 52 72 13 33 53 73) */ \ + col45o = vec_mergeh(row15h, row37h); /* col45o=(14 34 54 74 15 35 55 75) */ \ + col67o = vec_mergel(row15h, row37h); /* col67o=(16 36 56 76 17 37 57 77) */ \ + \ + /* transpose coefficients (phase 3) */ \ + col##0 = vec_mergeh(col01e, col01o); /* col0=(00 10 20 30 40 50 60 70) */ \ + col##1 = vec_mergel(col01e, col01o); /* col1=(01 11 21 31 41 51 61 71) */ \ + col##2 = vec_mergeh(col23e, col23o); /* col2=(02 12 22 32 42 52 62 72) */ \ + col##3 = vec_mergel(col23e, col23o); /* col3=(03 13 23 33 43 53 63 73) */ \ + col##4 = vec_mergeh(col45e, col45o); /* col4=(04 14 24 34 44 54 64 74) */ \ + col##5 = vec_mergel(col45e, col45o); /* col5=(05 15 25 35 45 55 65 75) */ \ + col##6 = vec_mergeh(col67e, col67o); /* col6=(06 16 26 36 46 56 66 76) */ \ + col##7 = vec_mergel(col67e, col67o); /* col7=(07 17 27 37 47 57 67 77) */ \ +} + +#ifndef min +#define min(a,b) ((a) < (b) ? (a) : (b)) +#endif + + +/* Macros to abstract big/little endian bit twiddling */ + +#if __BIG_ENDIAN__ + +#define VEC_LD(a, b) vec_ld(a, b) +#define VEC_ST(a, b, c) vec_st(a, b, c) +#define VEC_UNPACKHU(a) vec_mergeh(pb_zero, a) +#define VEC_UNPACKLU(a) vec_mergel(pb_zero, a) + +#else + +#define VEC_LD(a, b) vec_vsx_ld(a, b) +#define VEC_ST(a, b, c) vec_vsx_st(a, b, c) +#define VEC_UNPACKHU(a) vec_mergeh(a, pb_zero) +#define VEC_UNPACKLU(a) vec_mergel(a, pb_zero) + +#endif diff --git a/media/libjpeg/simd/jsimd_arm.c b/media/libjpeg/simd/jsimd_arm.c new file mode 100644 index 000000000..61cd073e1 --- /dev/null +++ b/media/libjpeg/simd/jsimd_arm.c @@ -0,0 +1,727 @@ +/* + * jsimd_arm.c + * + * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB + * Copyright (C) 2009-2011, 2013-2014, 2016, D. R. Commander. + * Copyright (C) 2015-2016, Matthieu Darbois. + * + * Based on the x86 SIMD extension for IJG JPEG library, + * Copyright (C) 1999-2006, MIYASAKA Masaru. + * For conditions of distribution and use, see copyright notice in jsimdext.inc + * + * This file contains the interface between the "normal" portions + * of the library and the SIMD implementations when running on a + * 32-bit ARM architecture. + */ + +#define JPEG_INTERNALS +#include "../jinclude.h" +#include "../jpeglib.h" +#include "../jsimd.h" +#include "../jdct.h" +#include "../jsimddct.h" +#include "jsimd.h" + +#include <stdio.h> +#include <string.h> +#include <ctype.h> + +static unsigned int simd_support = ~0; +static unsigned int simd_huffman = 1; + +#if defined(__linux__) || defined(ANDROID) || defined(__ANDROID__) + +#define SOMEWHAT_SANE_PROC_CPUINFO_SIZE_LIMIT (1024 * 1024) + +LOCAL(int) +check_feature (char *buffer, char *feature) +{ + char *p; + if (*feature == 0) + return 0; + if (strncmp(buffer, "Features", 8) != 0) + return 0; + buffer += 8; + while (isspace(*buffer)) + buffer++; + + /* Check if 'feature' is present in the buffer as a separate word */ + while ((p = strstr(buffer, feature))) { + if (p > buffer && !isspace(*(p - 1))) { + buffer++; + continue; + } + p += strlen(feature); + if (*p != 0 && !isspace(*p)) { + buffer++; + continue; + } + return 1; + } + return 0; +} + +LOCAL(int) +parse_proc_cpuinfo (int bufsize) +{ + char *buffer = (char *)malloc(bufsize); + FILE *fd; + simd_support = 0; + + if (!buffer) + return 0; + + fd = fopen("/proc/cpuinfo", "r"); + if (fd) { + while (fgets(buffer, bufsize, fd)) { + if (!strchr(buffer, '\n') && !feof(fd)) { + /* "impossible" happened - insufficient size of the buffer! */ + fclose(fd); + free(buffer); + return 0; + } + if (check_feature(buffer, "neon")) + simd_support |= JSIMD_ARM_NEON; + } + fclose(fd); + } + free(buffer); + return 1; +} + +#endif + +/* + * Check what SIMD accelerations are supported. + * + * FIXME: This code is racy under a multi-threaded environment. + */ +LOCAL(void) +init_simd (void) +{ + char *env = NULL; +#if !defined(__ARM_NEON__) && defined(__linux__) || defined(ANDROID) || defined(__ANDROID__) + int bufsize = 1024; /* an initial guess for the line buffer size limit */ +#endif + + if (simd_support != ~0U) + return; + + simd_support = 0; + +#if defined(__ARM_NEON__) + simd_support |= JSIMD_ARM_NEON; +#elif defined(__linux__) || defined(ANDROID) || defined(__ANDROID__) + /* We still have a chance to use NEON regardless of globally used + * -mcpu/-mfpu options passed to gcc by performing runtime detection via + * /proc/cpuinfo parsing on linux/android */ + while (!parse_proc_cpuinfo(bufsize)) { + bufsize *= 2; + if (bufsize > SOMEWHAT_SANE_PROC_CPUINFO_SIZE_LIMIT) + break; + } +#endif + + /* Force different settings through environment variables */ + env = getenv("JSIMD_FORCENEON"); + if ((env != NULL) && (strcmp(env, "1") == 0)) + simd_support = JSIMD_ARM_NEON; + env = getenv("JSIMD_FORCENONE"); + if ((env != NULL) && (strcmp(env, "1") == 0)) + simd_support = 0; + env = getenv("JSIMD_NOHUFFENC"); + if ((env != NULL) && (strcmp(env, "1") == 0)) + simd_huffman = 0; +} + +GLOBAL(int) +jsimd_can_rgb_ycc (void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4)) + return 0; + + if (simd_support & JSIMD_ARM_NEON) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_rgb_gray (void) +{ + init_simd(); + + return 0; +} + +GLOBAL(int) +jsimd_can_ycc_rgb (void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4)) + return 0; + + if (simd_support & JSIMD_ARM_NEON) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_ycc_rgb565 (void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + + if (simd_support & JSIMD_ARM_NEON) + return 1; + + return 0; +} + +GLOBAL(void) +jsimd_rgb_ycc_convert (j_compress_ptr cinfo, + JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows) +{ + void (*neonfct)(JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int); + + switch(cinfo->in_color_space) { + case JCS_EXT_RGB: + neonfct=jsimd_extrgb_ycc_convert_neon; + break; + case JCS_EXT_RGBX: + case JCS_EXT_RGBA: + neonfct=jsimd_extrgbx_ycc_convert_neon; + break; + case JCS_EXT_BGR: + neonfct=jsimd_extbgr_ycc_convert_neon; + break; + case JCS_EXT_BGRX: + case JCS_EXT_BGRA: + neonfct=jsimd_extbgrx_ycc_convert_neon; + break; + case JCS_EXT_XBGR: + case JCS_EXT_ABGR: + neonfct=jsimd_extxbgr_ycc_convert_neon; + break; + case JCS_EXT_XRGB: + case JCS_EXT_ARGB: + neonfct=jsimd_extxrgb_ycc_convert_neon; + break; + default: + neonfct=jsimd_extrgb_ycc_convert_neon; + break; + } + + neonfct(cinfo->image_width, input_buf, output_buf, output_row, num_rows); +} + +GLOBAL(void) +jsimd_rgb_gray_convert (j_compress_ptr cinfo, + JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows) +{ +} + +GLOBAL(void) +jsimd_ycc_rgb_convert (j_decompress_ptr cinfo, + JSAMPIMAGE input_buf, JDIMENSION input_row, + JSAMPARRAY output_buf, int num_rows) +{ + void (*neonfct)(JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY, int); + + switch(cinfo->out_color_space) { + case JCS_EXT_RGB: + neonfct=jsimd_ycc_extrgb_convert_neon; + break; + case JCS_EXT_RGBX: + case JCS_EXT_RGBA: + neonfct=jsimd_ycc_extrgbx_convert_neon; + break; + case JCS_EXT_BGR: + neonfct=jsimd_ycc_extbgr_convert_neon; + break; + case JCS_EXT_BGRX: + case JCS_EXT_BGRA: + neonfct=jsimd_ycc_extbgrx_convert_neon; + break; + case JCS_EXT_XBGR: + case JCS_EXT_ABGR: + neonfct=jsimd_ycc_extxbgr_convert_neon; + break; + case JCS_EXT_XRGB: + case JCS_EXT_ARGB: + neonfct=jsimd_ycc_extxrgb_convert_neon; + break; + default: + neonfct=jsimd_ycc_extrgb_convert_neon; + break; + } + + neonfct(cinfo->output_width, input_buf, input_row, output_buf, num_rows); +} + +GLOBAL(void) +jsimd_ycc_rgb565_convert (j_decompress_ptr cinfo, + JSAMPIMAGE input_buf, JDIMENSION input_row, + JSAMPARRAY output_buf, int num_rows) +{ + jsimd_ycc_rgb565_convert_neon(cinfo->output_width, input_buf, input_row, + output_buf, num_rows); +} + +GLOBAL(int) +jsimd_can_h2v2_downsample (void) +{ + init_simd(); + + return 0; +} + +GLOBAL(int) +jsimd_can_h2v1_downsample (void) +{ + init_simd(); + + return 0; +} + +GLOBAL(void) +jsimd_h2v2_downsample (j_compress_ptr cinfo, jpeg_component_info *compptr, + JSAMPARRAY input_data, JSAMPARRAY output_data) +{ +} + +GLOBAL(void) +jsimd_h2v1_downsample (j_compress_ptr cinfo, jpeg_component_info *compptr, + JSAMPARRAY input_data, JSAMPARRAY output_data) +{ +} + +GLOBAL(int) +jsimd_can_h2v2_upsample (void) +{ + init_simd(); + + return 0; +} + +GLOBAL(int) +jsimd_can_h2v1_upsample (void) +{ + init_simd(); + + return 0; +} + +GLOBAL(void) +jsimd_h2v2_upsample (j_decompress_ptr cinfo, + jpeg_component_info *compptr, + JSAMPARRAY input_data, + JSAMPARRAY *output_data_ptr) +{ +} + +GLOBAL(void) +jsimd_h2v1_upsample (j_decompress_ptr cinfo, + jpeg_component_info *compptr, + JSAMPARRAY input_data, + JSAMPARRAY *output_data_ptr) +{ +} + +GLOBAL(int) +jsimd_can_h2v2_fancy_upsample (void) +{ + init_simd(); + + return 0; +} + +GLOBAL(int) +jsimd_can_h2v1_fancy_upsample (void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + + if (simd_support & JSIMD_ARM_NEON) + return 1; + + return 0; +} + +GLOBAL(void) +jsimd_h2v2_fancy_upsample (j_decompress_ptr cinfo, + jpeg_component_info *compptr, + JSAMPARRAY input_data, + JSAMPARRAY *output_data_ptr) +{ +} + +GLOBAL(void) +jsimd_h2v1_fancy_upsample (j_decompress_ptr cinfo, + jpeg_component_info *compptr, + JSAMPARRAY input_data, + JSAMPARRAY *output_data_ptr) +{ + jsimd_h2v1_fancy_upsample_neon(cinfo->max_v_samp_factor, + compptr->downsampled_width, input_data, + output_data_ptr); +} + +GLOBAL(int) +jsimd_can_h2v2_merged_upsample (void) +{ + init_simd(); + + return 0; +} + +GLOBAL(int) +jsimd_can_h2v1_merged_upsample (void) +{ + init_simd(); + + return 0; +} + +GLOBAL(void) +jsimd_h2v2_merged_upsample (j_decompress_ptr cinfo, + JSAMPIMAGE input_buf, + JDIMENSION in_row_group_ctr, + JSAMPARRAY output_buf) +{ +} + +GLOBAL(void) +jsimd_h2v1_merged_upsample (j_decompress_ptr cinfo, + JSAMPIMAGE input_buf, + JDIMENSION in_row_group_ctr, + JSAMPARRAY output_buf) +{ +} + +GLOBAL(int) +jsimd_can_convsamp (void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (DCTSIZE != 8) + return 0; + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + if (sizeof(DCTELEM) != 2) + return 0; + + if (simd_support & JSIMD_ARM_NEON) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_convsamp_float (void) +{ + init_simd(); + + return 0; +} + +GLOBAL(void) +jsimd_convsamp (JSAMPARRAY sample_data, JDIMENSION start_col, + DCTELEM *workspace) +{ + jsimd_convsamp_neon(sample_data, start_col, workspace); +} + +GLOBAL(void) +jsimd_convsamp_float (JSAMPARRAY sample_data, JDIMENSION start_col, + FAST_FLOAT *workspace) +{ +} + +GLOBAL(int) +jsimd_can_fdct_islow (void) +{ + init_simd(); + + return 0; +} + +GLOBAL(int) +jsimd_can_fdct_ifast (void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (DCTSIZE != 8) + return 0; + if (sizeof(DCTELEM) != 2) + return 0; + + if (simd_support & JSIMD_ARM_NEON) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_fdct_float (void) +{ + init_simd(); + + return 0; +} + +GLOBAL(void) +jsimd_fdct_islow (DCTELEM *data) +{ +} + +GLOBAL(void) +jsimd_fdct_ifast (DCTELEM *data) +{ + jsimd_fdct_ifast_neon(data); +} + +GLOBAL(void) +jsimd_fdct_float (FAST_FLOAT *data) +{ +} + +GLOBAL(int) +jsimd_can_quantize (void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (DCTSIZE != 8) + return 0; + if (sizeof(JCOEF) != 2) + return 0; + if (sizeof(DCTELEM) != 2) + return 0; + + if (simd_support & JSIMD_ARM_NEON) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_quantize_float (void) +{ + init_simd(); + + return 0; +} + +GLOBAL(void) +jsimd_quantize (JCOEFPTR coef_block, DCTELEM *divisors, + DCTELEM *workspace) +{ + jsimd_quantize_neon(coef_block, divisors, workspace); +} + +GLOBAL(void) +jsimd_quantize_float (JCOEFPTR coef_block, FAST_FLOAT *divisors, + FAST_FLOAT *workspace) +{ +} + +GLOBAL(int) +jsimd_can_idct_2x2 (void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (DCTSIZE != 8) + return 0; + if (sizeof(JCOEF) != 2) + return 0; + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + if (sizeof(ISLOW_MULT_TYPE) != 2) + return 0; + + if (simd_support & JSIMD_ARM_NEON) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_idct_4x4 (void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (DCTSIZE != 8) + return 0; + if (sizeof(JCOEF) != 2) + return 0; + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + if (sizeof(ISLOW_MULT_TYPE) != 2) + return 0; + + if (simd_support & JSIMD_ARM_NEON) + return 1; + + return 0; +} + +GLOBAL(void) +jsimd_idct_2x2 (j_decompress_ptr cinfo, jpeg_component_info *compptr, + JCOEFPTR coef_block, JSAMPARRAY output_buf, + JDIMENSION output_col) +{ + jsimd_idct_2x2_neon(compptr->dct_table, coef_block, output_buf, + output_col); +} + +GLOBAL(void) +jsimd_idct_4x4 (j_decompress_ptr cinfo, jpeg_component_info *compptr, + JCOEFPTR coef_block, JSAMPARRAY output_buf, + JDIMENSION output_col) +{ + jsimd_idct_4x4_neon(compptr->dct_table, coef_block, output_buf, + output_col); +} + +GLOBAL(int) +jsimd_can_idct_islow (void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (DCTSIZE != 8) + return 0; + if (sizeof(JCOEF) != 2) + return 0; + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + if (sizeof(ISLOW_MULT_TYPE) != 2) + return 0; + + if (simd_support & JSIMD_ARM_NEON) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_idct_ifast (void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (DCTSIZE != 8) + return 0; + if (sizeof(JCOEF) != 2) + return 0; + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + if (sizeof(IFAST_MULT_TYPE) != 2) + return 0; + if (IFAST_SCALE_BITS != 2) + return 0; + + if (simd_support & JSIMD_ARM_NEON) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_idct_float (void) +{ + init_simd(); + + return 0; +} + +GLOBAL(void) +jsimd_idct_islow (j_decompress_ptr cinfo, jpeg_component_info *compptr, + JCOEFPTR coef_block, JSAMPARRAY output_buf, + JDIMENSION output_col) +{ + jsimd_idct_islow_neon(compptr->dct_table, coef_block, output_buf, + output_col); +} + +GLOBAL(void) +jsimd_idct_ifast (j_decompress_ptr cinfo, jpeg_component_info *compptr, + JCOEFPTR coef_block, JSAMPARRAY output_buf, + JDIMENSION output_col) +{ + jsimd_idct_ifast_neon(compptr->dct_table, coef_block, output_buf, + output_col); +} + +GLOBAL(void) +jsimd_idct_float (j_decompress_ptr cinfo, jpeg_component_info *compptr, + JCOEFPTR coef_block, JSAMPARRAY output_buf, + JDIMENSION output_col) +{ +} + +GLOBAL(int) +jsimd_can_huff_encode_one_block (void) +{ + init_simd(); + + if (DCTSIZE != 8) + return 0; + if (sizeof(JCOEF) != 2) + return 0; + + if (simd_support & JSIMD_ARM_NEON && simd_huffman) + return 1; + + return 0; +} + +GLOBAL(JOCTET*) +jsimd_huff_encode_one_block (void *state, JOCTET *buffer, JCOEFPTR block, + int last_dc_val, c_derived_tbl *dctbl, + c_derived_tbl *actbl) +{ + return jsimd_huff_encode_one_block_neon(state, buffer, block, last_dc_val, + dctbl, actbl); +} diff --git a/media/libjpeg/simd/jsimd_arm64.c b/media/libjpeg/simd/jsimd_arm64.c new file mode 100644 index 000000000..09449bb6f --- /dev/null +++ b/media/libjpeg/simd/jsimd_arm64.c @@ -0,0 +1,802 @@ +/* + * jsimd_arm64.c + * + * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB + * Copyright (C) 2009-2011, 2013-2014, 2016, D. R. Commander. + * Copyright (C) 2015-2016, Matthieu Darbois. + * + * Based on the x86 SIMD extension for IJG JPEG library, + * Copyright (C) 1999-2006, MIYASAKA Masaru. + * For conditions of distribution and use, see copyright notice in jsimdext.inc + * + * This file contains the interface between the "normal" portions + * of the library and the SIMD implementations when running on a + * 64-bit ARM architecture. + */ + +#define JPEG_INTERNALS +#include "../jinclude.h" +#include "../jpeglib.h" +#include "../jsimd.h" +#include "../jdct.h" +#include "../jsimddct.h" +#include "jsimd.h" + +#include <stdio.h> +#include <string.h> +#include <ctype.h> + +#define JSIMD_FASTLD3 1 +#define JSIMD_FASTST3 2 +#define JSIMD_FASTTBL 4 + +static unsigned int simd_support = ~0; +static unsigned int simd_huffman = 1; +static unsigned int simd_features = JSIMD_FASTLD3 | JSIMD_FASTST3 | + JSIMD_FASTTBL; + +#if defined(__linux__) || defined(ANDROID) || defined(__ANDROID__) + +#define SOMEWHAT_SANE_PROC_CPUINFO_SIZE_LIMIT (1024 * 1024) + +LOCAL(int) +check_cpuinfo (char *buffer, const char *field, char *value) +{ + char *p; + if (*value == 0) + return 0; + if (strncmp(buffer, field, strlen(field)) != 0) + return 0; + buffer += strlen(field); + while (isspace(*buffer)) + buffer++; + + /* Check if 'value' is present in the buffer as a separate word */ + while ((p = strstr(buffer, value))) { + if (p > buffer && !isspace(*(p - 1))) { + buffer++; + continue; + } + p += strlen(value); + if (*p != 0 && !isspace(*p)) { + buffer++; + continue; + } + return 1; + } + return 0; +} + +LOCAL(int) +parse_proc_cpuinfo (int bufsize) +{ + char *buffer = (char *)malloc(bufsize); + FILE *fd; + + if (!buffer) + return 0; + + fd = fopen("/proc/cpuinfo", "r"); + if (fd) { + while (fgets(buffer, bufsize, fd)) { + if (!strchr(buffer, '\n') && !feof(fd)) { + /* "impossible" happened - insufficient size of the buffer! */ + fclose(fd); + free(buffer); + return 0; + } + if (check_cpuinfo(buffer, "CPU part", "0xd03") || + check_cpuinfo(buffer, "CPU part", "0xd07")) + /* The Cortex-A53 has a slow tbl implementation. We can gain a few + percent speedup by disabling the use of that instruction. The + speedup on Cortex-A57 is more subtle but still measurable. */ + simd_features &= ~JSIMD_FASTTBL; + else if (check_cpuinfo(buffer, "CPU part", "0x0a1")) + /* The SIMD version of Huffman encoding is slower than the C version on + Cavium ThunderX. Also, ld3 and st3 are abyssmally slow on that + CPU. */ + simd_huffman = simd_features = 0; + } + fclose(fd); + } + free(buffer); + return 1; +} + +#endif + +/* + * Check what SIMD accelerations are supported. + * + * FIXME: This code is racy under a multi-threaded environment. + */ + +/* + * ARMv8 architectures support NEON extensions by default. + * It is no longer optional as it was with ARMv7. + */ + + +LOCAL(void) +init_simd (void) +{ + char *env = NULL; +#if defined(__linux__) || defined(ANDROID) || defined(__ANDROID__) + int bufsize = 1024; /* an initial guess for the line buffer size limit */ +#endif + + if (simd_support != ~0U) + return; + + simd_support = 0; + + simd_support |= JSIMD_ARM_NEON; +#if defined(__linux__) || defined(ANDROID) || defined(__ANDROID__) + while (!parse_proc_cpuinfo(bufsize)) { + bufsize *= 2; + if (bufsize > SOMEWHAT_SANE_PROC_CPUINFO_SIZE_LIMIT) + break; + } +#endif + + /* Force different settings through environment variables */ + env = getenv("JSIMD_FORCENEON"); + if ((env != NULL) && (strcmp(env, "1") == 0)) + simd_support = JSIMD_ARM_NEON; + env = getenv("JSIMD_FORCENONE"); + if ((env != NULL) && (strcmp(env, "1") == 0)) + simd_support = 0; + env = getenv("JSIMD_NOHUFFENC"); + if ((env != NULL) && (strcmp(env, "1") == 0)) + simd_huffman = 0; + env = getenv("JSIMD_FASTLD3"); + if ((env != NULL) && (strcmp(env, "1") == 0)) + simd_features |= JSIMD_FASTLD3; + if ((env != NULL) && (strcmp(env, "0") == 0)) + simd_features &= ~JSIMD_FASTLD3; + env = getenv("JSIMD_FASTST3"); + if ((env != NULL) && (strcmp(env, "1") == 0)) + simd_features |= JSIMD_FASTST3; + if ((env != NULL) && (strcmp(env, "0") == 0)) + simd_features &= ~JSIMD_FASTST3; +} + +GLOBAL(int) +jsimd_can_rgb_ycc (void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4)) + return 0; + + if (simd_support & JSIMD_ARM_NEON) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_rgb_gray (void) +{ + init_simd(); + + return 0; +} + +GLOBAL(int) +jsimd_can_ycc_rgb (void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4)) + return 0; + + if (simd_support & JSIMD_ARM_NEON) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_ycc_rgb565 (void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + + if (simd_support & JSIMD_ARM_NEON) + return 1; + + return 0; +} + +GLOBAL(void) +jsimd_rgb_ycc_convert (j_compress_ptr cinfo, + JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows) +{ + void (*neonfct)(JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int); + + switch(cinfo->in_color_space) { + case JCS_EXT_RGB: + if (simd_features & JSIMD_FASTLD3) + neonfct=jsimd_extrgb_ycc_convert_neon; + else + neonfct=jsimd_extrgb_ycc_convert_neon_slowld3; + break; + case JCS_EXT_RGBX: + case JCS_EXT_RGBA: + neonfct=jsimd_extrgbx_ycc_convert_neon; + break; + case JCS_EXT_BGR: + if (simd_features & JSIMD_FASTLD3) + neonfct=jsimd_extbgr_ycc_convert_neon; + else + neonfct=jsimd_extbgr_ycc_convert_neon_slowld3; + break; + case JCS_EXT_BGRX: + case JCS_EXT_BGRA: + neonfct=jsimd_extbgrx_ycc_convert_neon; + break; + case JCS_EXT_XBGR: + case JCS_EXT_ABGR: + neonfct=jsimd_extxbgr_ycc_convert_neon; + break; + case JCS_EXT_XRGB: + case JCS_EXT_ARGB: + neonfct=jsimd_extxrgb_ycc_convert_neon; + break; + default: + if (simd_features & JSIMD_FASTLD3) + neonfct=jsimd_extrgb_ycc_convert_neon; + else + neonfct=jsimd_extrgb_ycc_convert_neon_slowld3; + break; + } + + neonfct(cinfo->image_width, input_buf, output_buf, output_row, num_rows); +} + +GLOBAL(void) +jsimd_rgb_gray_convert (j_compress_ptr cinfo, + JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows) +{ +} + +GLOBAL(void) +jsimd_ycc_rgb_convert (j_decompress_ptr cinfo, + JSAMPIMAGE input_buf, JDIMENSION input_row, + JSAMPARRAY output_buf, int num_rows) +{ + void (*neonfct)(JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY, int); + + switch(cinfo->out_color_space) { + case JCS_EXT_RGB: + if (simd_features & JSIMD_FASTST3) + neonfct=jsimd_ycc_extrgb_convert_neon; + else + neonfct=jsimd_ycc_extrgb_convert_neon_slowst3; + break; + case JCS_EXT_RGBX: + case JCS_EXT_RGBA: + neonfct=jsimd_ycc_extrgbx_convert_neon; + break; + case JCS_EXT_BGR: + if (simd_features & JSIMD_FASTST3) + neonfct=jsimd_ycc_extbgr_convert_neon; + else + neonfct=jsimd_ycc_extbgr_convert_neon_slowst3; + break; + case JCS_EXT_BGRX: + case JCS_EXT_BGRA: + neonfct=jsimd_ycc_extbgrx_convert_neon; + break; + case JCS_EXT_XBGR: + case JCS_EXT_ABGR: + neonfct=jsimd_ycc_extxbgr_convert_neon; + break; + case JCS_EXT_XRGB: + case JCS_EXT_ARGB: + neonfct=jsimd_ycc_extxrgb_convert_neon; + break; + default: + if (simd_features & JSIMD_FASTST3) + neonfct=jsimd_ycc_extrgb_convert_neon; + else + neonfct=jsimd_ycc_extrgb_convert_neon_slowst3; + break; + } + + neonfct(cinfo->output_width, input_buf, input_row, output_buf, num_rows); +} + +GLOBAL(void) +jsimd_ycc_rgb565_convert (j_decompress_ptr cinfo, + JSAMPIMAGE input_buf, JDIMENSION input_row, + JSAMPARRAY output_buf, int num_rows) +{ + jsimd_ycc_rgb565_convert_neon(cinfo->output_width, input_buf, input_row, + output_buf, num_rows); +} + +GLOBAL(int) +jsimd_can_h2v2_downsample (void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (BITS_IN_JSAMPLE != 8) + return 0; + if (DCTSIZE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + + if (simd_support & JSIMD_ARM_NEON) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_h2v1_downsample (void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (BITS_IN_JSAMPLE != 8) + return 0; + if (DCTSIZE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + + if (simd_support & JSIMD_ARM_NEON) + return 1; + + return 0; +} + +GLOBAL(void) +jsimd_h2v2_downsample (j_compress_ptr cinfo, jpeg_component_info *compptr, + JSAMPARRAY input_data, JSAMPARRAY output_data) +{ + jsimd_h2v2_downsample_neon(cinfo->image_width, cinfo->max_v_samp_factor, + compptr->v_samp_factor, compptr->width_in_blocks, + input_data, output_data); +} + +GLOBAL(void) +jsimd_h2v1_downsample (j_compress_ptr cinfo, jpeg_component_info *compptr, + JSAMPARRAY input_data, JSAMPARRAY output_data) +{ + jsimd_h2v1_downsample_neon(cinfo->image_width, cinfo->max_v_samp_factor, + compptr->v_samp_factor, compptr->width_in_blocks, + input_data, output_data); +} + +GLOBAL(int) +jsimd_can_h2v2_upsample (void) +{ + init_simd(); + + return 0; +} + +GLOBAL(int) +jsimd_can_h2v1_upsample (void) +{ + init_simd(); + + return 0; +} + +GLOBAL(void) +jsimd_h2v2_upsample (j_decompress_ptr cinfo, + jpeg_component_info *compptr, + JSAMPARRAY input_data, + JSAMPARRAY *output_data_ptr) +{ +} + +GLOBAL(void) +jsimd_h2v1_upsample (j_decompress_ptr cinfo, + jpeg_component_info *compptr, + JSAMPARRAY input_data, + JSAMPARRAY *output_data_ptr) +{ +} + +GLOBAL(int) +jsimd_can_h2v2_fancy_upsample (void) +{ + init_simd(); + + return 0; +} + +GLOBAL(int) +jsimd_can_h2v1_fancy_upsample (void) +{ + init_simd(); + + return 0; +} + +GLOBAL(void) +jsimd_h2v2_fancy_upsample (j_decompress_ptr cinfo, + jpeg_component_info *compptr, + JSAMPARRAY input_data, + JSAMPARRAY *output_data_ptr) +{ +} + +GLOBAL(void) +jsimd_h2v1_fancy_upsample (j_decompress_ptr cinfo, + jpeg_component_info *compptr, + JSAMPARRAY input_data, + JSAMPARRAY *output_data_ptr) +{ +} + +GLOBAL(int) +jsimd_can_h2v2_merged_upsample (void) +{ + init_simd(); + + return 0; +} + +GLOBAL(int) +jsimd_can_h2v1_merged_upsample (void) +{ + init_simd(); + + return 0; +} + +GLOBAL(void) +jsimd_h2v2_merged_upsample (j_decompress_ptr cinfo, + JSAMPIMAGE input_buf, + JDIMENSION in_row_group_ctr, + JSAMPARRAY output_buf) +{ +} + +GLOBAL(void) +jsimd_h2v1_merged_upsample (j_decompress_ptr cinfo, + JSAMPIMAGE input_buf, + JDIMENSION in_row_group_ctr, + JSAMPARRAY output_buf) +{ +} + +GLOBAL(int) +jsimd_can_convsamp (void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (DCTSIZE != 8) + return 0; + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + if (sizeof(DCTELEM) != 2) + return 0; + + if (simd_support & JSIMD_ARM_NEON) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_convsamp_float (void) +{ + init_simd(); + + return 0; +} + +GLOBAL(void) +jsimd_convsamp (JSAMPARRAY sample_data, JDIMENSION start_col, + DCTELEM *workspace) +{ + jsimd_convsamp_neon(sample_data, start_col, workspace); +} + +GLOBAL(void) +jsimd_convsamp_float (JSAMPARRAY sample_data, JDIMENSION start_col, + FAST_FLOAT *workspace) +{ +} + +GLOBAL(int) +jsimd_can_fdct_islow (void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (DCTSIZE != 8) + return 0; + if (sizeof(DCTELEM) != 2) + return 0; + + if (simd_support & JSIMD_ARM_NEON) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_fdct_ifast (void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (DCTSIZE != 8) + return 0; + if (sizeof(DCTELEM) != 2) + return 0; + + if (simd_support & JSIMD_ARM_NEON) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_fdct_float (void) +{ + init_simd(); + + return 0; +} + +GLOBAL(void) +jsimd_fdct_islow (DCTELEM *data) +{ + jsimd_fdct_islow_neon(data); +} + +GLOBAL(void) +jsimd_fdct_ifast (DCTELEM *data) +{ + jsimd_fdct_ifast_neon(data); +} + +GLOBAL(void) +jsimd_fdct_float (FAST_FLOAT *data) +{ +} + +GLOBAL(int) +jsimd_can_quantize (void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (DCTSIZE != 8) + return 0; + if (sizeof(JCOEF) != 2) + return 0; + if (sizeof(DCTELEM) != 2) + return 0; + + if (simd_support & JSIMD_ARM_NEON) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_quantize_float (void) +{ + init_simd(); + + return 0; +} + +GLOBAL(void) +jsimd_quantize (JCOEFPTR coef_block, DCTELEM *divisors, + DCTELEM *workspace) +{ + jsimd_quantize_neon(coef_block, divisors, workspace); +} + +GLOBAL(void) +jsimd_quantize_float (JCOEFPTR coef_block, FAST_FLOAT *divisors, + FAST_FLOAT *workspace) +{ +} + +GLOBAL(int) +jsimd_can_idct_2x2 (void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (DCTSIZE != 8) + return 0; + if (sizeof(JCOEF) != 2) + return 0; + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + if (sizeof(ISLOW_MULT_TYPE) != 2) + return 0; + + if (simd_support & JSIMD_ARM_NEON) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_idct_4x4 (void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (DCTSIZE != 8) + return 0; + if (sizeof(JCOEF) != 2) + return 0; + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + if (sizeof(ISLOW_MULT_TYPE) != 2) + return 0; + + if (simd_support & JSIMD_ARM_NEON) + return 1; + + return 0; +} + +GLOBAL(void) +jsimd_idct_2x2 (j_decompress_ptr cinfo, jpeg_component_info *compptr, + JCOEFPTR coef_block, JSAMPARRAY output_buf, + JDIMENSION output_col) +{ + jsimd_idct_2x2_neon(compptr->dct_table, coef_block, output_buf, + output_col); +} + +GLOBAL(void) +jsimd_idct_4x4 (j_decompress_ptr cinfo, jpeg_component_info *compptr, + JCOEFPTR coef_block, JSAMPARRAY output_buf, + JDIMENSION output_col) +{ + jsimd_idct_4x4_neon(compptr->dct_table, coef_block, output_buf, + output_col); +} + +GLOBAL(int) +jsimd_can_idct_islow (void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (DCTSIZE != 8) + return 0; + if (sizeof(JCOEF) != 2) + return 0; + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + if (sizeof(ISLOW_MULT_TYPE) != 2) + return 0; + + if (simd_support & JSIMD_ARM_NEON) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_idct_ifast (void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (DCTSIZE != 8) + return 0; + if (sizeof(JCOEF) != 2) + return 0; + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + if (sizeof(IFAST_MULT_TYPE) != 2) + return 0; + if (IFAST_SCALE_BITS != 2) + return 0; + + if (simd_support & JSIMD_ARM_NEON) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_idct_float (void) +{ + init_simd(); + + return 0; +} + +GLOBAL(void) +jsimd_idct_islow (j_decompress_ptr cinfo, jpeg_component_info *compptr, + JCOEFPTR coef_block, JSAMPARRAY output_buf, + JDIMENSION output_col) +{ + jsimd_idct_islow_neon(compptr->dct_table, coef_block, output_buf, + output_col); +} + +GLOBAL(void) +jsimd_idct_ifast (j_decompress_ptr cinfo, jpeg_component_info *compptr, + JCOEFPTR coef_block, JSAMPARRAY output_buf, + JDIMENSION output_col) +{ + jsimd_idct_ifast_neon(compptr->dct_table, coef_block, output_buf, + output_col); +} + +GLOBAL(void) +jsimd_idct_float (j_decompress_ptr cinfo, jpeg_component_info *compptr, + JCOEFPTR coef_block, JSAMPARRAY output_buf, + JDIMENSION output_col) +{ +} + +GLOBAL(int) +jsimd_can_huff_encode_one_block (void) +{ + init_simd(); + + if (DCTSIZE != 8) + return 0; + if (sizeof(JCOEF) != 2) + return 0; + + if (simd_support & JSIMD_ARM_NEON && simd_huffman) + return 1; + + return 0; +} + +GLOBAL(JOCTET*) +jsimd_huff_encode_one_block (void *state, JOCTET *buffer, JCOEFPTR block, + int last_dc_val, c_derived_tbl *dctbl, + c_derived_tbl *actbl) +{ + if (simd_features & JSIMD_FASTTBL) + return jsimd_huff_encode_one_block_neon(state, buffer, block, last_dc_val, + dctbl, actbl); + else + return jsimd_huff_encode_one_block_neon_slowtbl(state, buffer, block, + last_dc_val, dctbl, actbl); +} diff --git a/media/libjpeg/simd/jsimd_arm64_neon.S b/media/libjpeg/simd/jsimd_arm64_neon.S new file mode 100644 index 000000000..330985824 --- /dev/null +++ b/media/libjpeg/simd/jsimd_arm64_neon.S @@ -0,0 +1,3425 @@ +/* + * ARMv8 NEON optimizations for libjpeg-turbo + * + * Copyright (C) 2009-2011, Nokia Corporation and/or its subsidiary(-ies). + * All Rights Reserved. + * Author: Siarhei Siamashka <siarhei.siamashka@nokia.com> + * Copyright (C) 2013-2014, Linaro Limited. All Rights Reserved. + * Author: Ragesh Radhakrishnan <ragesh.r@linaro.org> + * Copyright (C) 2014-2016, D. R. Commander. All Rights Reserved. + * Copyright (C) 2015-2016, Matthieu Darbois. All Rights Reserved. + * Copyright (C) 2016, Siarhei Siamashka. All Rights Reserved. + * + * This software is provided 'as-is', without any express or implied + * warranty. In no event will the authors be held liable for any damages + * arising from the use of this software. + * + * Permission is granted to anyone to use this software for any purpose, + * including commercial applications, and to alter it and redistribute it + * freely, subject to the following restrictions: + * + * 1. The origin of this software must not be misrepresented; you must not + * claim that you wrote the original software. If you use this software + * in a product, an acknowledgment in the product documentation would be + * appreciated but is not required. + * 2. Altered source versions must be plainly marked as such, and must not be + * misrepresented as being the original software. + * 3. This notice may not be removed or altered from any source distribution. + */ + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack, "", %progbits /* mark stack as non-executable */ +#endif + +.text + + +#define RESPECT_STRICT_ALIGNMENT 1 + + +/*****************************************************************************/ + +/* Supplementary macro for setting function attributes */ +.macro asm_function fname +#ifdef __APPLE__ + .globl _\fname +_\fname: +#else + .global \fname +#ifdef __ELF__ + .hidden \fname + .type \fname, %function +#endif +\fname: +#endif +.endm + +/* Transpose elements of single 128 bit registers */ +.macro transpose_single x0, x1, xi, xilen, literal + ins \xi\xilen[0], \x0\xilen[0] + ins \x1\xilen[0], \x0\xilen[1] + trn1 \x0\literal, \x0\literal, \x1\literal + trn2 \x1\literal, \xi\literal, \x1\literal +.endm + +/* Transpose elements of 2 differnet registers */ +.macro transpose x0, x1, xi, xilen, literal + mov \xi\xilen, \x0\xilen + trn1 \x0\literal, \x0\literal, \x1\literal + trn2 \x1\literal, \xi\literal, \x1\literal +.endm + +/* Transpose a block of 4x4 coefficients in four 64-bit registers */ +.macro transpose_4x4_32 x0, x0len, x1, x1len, x2, x2len, x3, x3len, xi, xilen + mov \xi\xilen, \x0\xilen + trn1 \x0\x0len, \x0\x0len, \x2\x2len + trn2 \x2\x2len, \xi\x0len, \x2\x2len + mov \xi\xilen, \x1\xilen + trn1 \x1\x1len, \x1\x1len, \x3\x3len + trn2 \x3\x3len, \xi\x1len, \x3\x3len +.endm + +.macro transpose_4x4_16 x0, x0len, x1, x1len, x2, x2len, x3, x3len, xi, xilen + mov \xi\xilen, \x0\xilen + trn1 \x0\x0len, \x0\x0len, \x1\x1len + trn2 \x1\x2len, \xi\x0len, \x1\x2len + mov \xi\xilen, \x2\xilen + trn1 \x2\x2len, \x2\x2len, \x3\x3len + trn2 \x3\x2len, \xi\x1len, \x3\x3len +.endm + +.macro transpose_4x4 x0, x1, x2, x3, x5 + transpose_4x4_16 \x0, .4h, \x1, .4h, \x2, .4h, \x3, .4h, \x5, .16b + transpose_4x4_32 \x0, .2s, \x1, .2s, \x2, .2s, \x3, .2s, \x5, .16b +.endm + +.macro transpose_8x8 l0, l1, l2, l3, l4, l5, l6, l7, t0, t1, t2, t3 + trn1 \t0\().8h, \l0\().8h, \l1\().8h + trn1 \t1\().8h, \l2\().8h, \l3\().8h + trn1 \t2\().8h, \l4\().8h, \l5\().8h + trn1 \t3\().8h, \l6\().8h, \l7\().8h + trn2 \l1\().8h, \l0\().8h, \l1\().8h + trn2 \l3\().8h, \l2\().8h, \l3\().8h + trn2 \l5\().8h, \l4\().8h, \l5\().8h + trn2 \l7\().8h, \l6\().8h, \l7\().8h + + trn1 \l4\().4s, \t2\().4s, \t3\().4s + trn2 \t3\().4s, \t2\().4s, \t3\().4s + trn1 \t2\().4s, \t0\().4s, \t1\().4s + trn2 \l2\().4s, \t0\().4s, \t1\().4s + trn1 \t0\().4s, \l1\().4s, \l3\().4s + trn2 \l3\().4s, \l1\().4s, \l3\().4s + trn2 \t1\().4s, \l5\().4s, \l7\().4s + trn1 \l5\().4s, \l5\().4s, \l7\().4s + + trn2 \l6\().2d, \l2\().2d, \t3\().2d + trn1 \l0\().2d, \t2\().2d, \l4\().2d + trn1 \l1\().2d, \t0\().2d, \l5\().2d + trn2 \l7\().2d, \l3\().2d, \t1\().2d + trn1 \l2\().2d, \l2\().2d, \t3\().2d + trn2 \l4\().2d, \t2\().2d, \l4\().2d + trn1 \l3\().2d, \l3\().2d, \t1\().2d + trn2 \l5\().2d, \t0\().2d, \l5\().2d +.endm + + +#define CENTERJSAMPLE 128 + +/*****************************************************************************/ + +/* + * Perform dequantization and inverse DCT on one block of coefficients. + * + * GLOBAL(void) + * jsimd_idct_islow_neon (void *dct_table, JCOEFPTR coef_block, + * JSAMPARRAY output_buf, JDIMENSION output_col) + */ + +#define CONST_BITS 13 +#define PASS1_BITS 2 + +#define F_0_298 2446 /* FIX(0.298631336) */ +#define F_0_390 3196 /* FIX(0.390180644) */ +#define F_0_541 4433 /* FIX(0.541196100) */ +#define F_0_765 6270 /* FIX(0.765366865) */ +#define F_0_899 7373 /* FIX(0.899976223) */ +#define F_1_175 9633 /* FIX(1.175875602) */ +#define F_1_501 12299 /* FIX(1.501321110) */ +#define F_1_847 15137 /* FIX(1.847759065) */ +#define F_1_961 16069 /* FIX(1.961570560) */ +#define F_2_053 16819 /* FIX(2.053119869) */ +#define F_2_562 20995 /* FIX(2.562915447) */ +#define F_3_072 25172 /* FIX(3.072711026) */ + +.balign 16 +Ljsimd_idct_islow_neon_consts: + .short F_0_298 + .short -F_0_390 + .short F_0_541 + .short F_0_765 + .short - F_0_899 + .short F_1_175 + .short F_1_501 + .short - F_1_847 + .short - F_1_961 + .short F_2_053 + .short - F_2_562 + .short F_3_072 + .short 0 /* padding */ + .short 0 + .short 0 + .short 0 + +#undef F_0_298 +#undef F_0_390 +#undef F_0_541 +#undef F_0_765 +#undef F_0_899 +#undef F_1_175 +#undef F_1_501 +#undef F_1_847 +#undef F_1_961 +#undef F_2_053 +#undef F_2_562 +#undef F_3_072 + +#define XFIX_P_0_298 v0.h[0] +#define XFIX_N_0_390 v0.h[1] +#define XFIX_P_0_541 v0.h[2] +#define XFIX_P_0_765 v0.h[3] +#define XFIX_N_0_899 v0.h[4] +#define XFIX_P_1_175 v0.h[5] +#define XFIX_P_1_501 v0.h[6] +#define XFIX_N_1_847 v0.h[7] +#define XFIX_N_1_961 v1.h[0] +#define XFIX_P_2_053 v1.h[1] +#define XFIX_N_2_562 v1.h[2] +#define XFIX_P_3_072 v1.h[3] + +asm_function jsimd_idct_islow_neon + DCT_TABLE .req x0 + COEF_BLOCK .req x1 + OUTPUT_BUF .req x2 + OUTPUT_COL .req x3 + TMP1 .req x0 + TMP2 .req x1 + TMP3 .req x9 + TMP4 .req x10 + TMP5 .req x11 + TMP6 .req x12 + TMP7 .req x13 + TMP8 .req x14 + + /* OUTPUT_COL is a JDIMENSION (unsigned int) argument, so the ABI doesn't + guarantee that the upper (unused) 32 bits of x3 are valid. This + instruction ensures that those bits are set to zero. */ + uxtw x3, w3 + + sub sp, sp, #64 + adr x15, Ljsimd_idct_islow_neon_consts + mov x10, sp + st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x10], #32 + st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x10], #32 + ld1 {v0.8h, v1.8h}, [x15] + ld1 {v2.8h, v3.8h, v4.8h, v5.8h}, [COEF_BLOCK], #64 + ld1 {v18.8h, v19.8h, v20.8h, v21.8h}, [DCT_TABLE], #64 + ld1 {v6.8h, v7.8h, v8.8h, v9.8h}, [COEF_BLOCK], #64 + ld1 {v22.8h, v23.8h, v24.8h, v25.8h}, [DCT_TABLE], #64 + + cmeq v16.8h, v3.8h, #0 + cmeq v26.8h, v4.8h, #0 + cmeq v27.8h, v5.8h, #0 + cmeq v28.8h, v6.8h, #0 + cmeq v29.8h, v7.8h, #0 + cmeq v30.8h, v8.8h, #0 + cmeq v31.8h, v9.8h, #0 + + and v10.16b, v16.16b, v26.16b + and v11.16b, v27.16b, v28.16b + and v12.16b, v29.16b, v30.16b + and v13.16b, v31.16b, v10.16b + and v14.16b, v11.16b, v12.16b + mul v2.8h, v2.8h, v18.8h + and v15.16b, v13.16b, v14.16b + shl v10.8h, v2.8h, #(PASS1_BITS) + sqxtn v16.8b, v15.8h + mov TMP1, v16.d[0] + mvn TMP2, TMP1 + + cbnz TMP2, 2f + /* case all AC coeffs are zeros */ + dup v2.2d, v10.d[0] + dup v6.2d, v10.d[1] + mov v3.16b, v2.16b + mov v7.16b, v6.16b + mov v4.16b, v2.16b + mov v8.16b, v6.16b + mov v5.16b, v2.16b + mov v9.16b, v6.16b +1: + /* for this transpose, we should organise data like this: + * 00, 01, 02, 03, 40, 41, 42, 43 + * 10, 11, 12, 13, 50, 51, 52, 53 + * 20, 21, 22, 23, 60, 61, 62, 63 + * 30, 31, 32, 33, 70, 71, 72, 73 + * 04, 05, 06, 07, 44, 45, 46, 47 + * 14, 15, 16, 17, 54, 55, 56, 57 + * 24, 25, 26, 27, 64, 65, 66, 67 + * 34, 35, 36, 37, 74, 75, 76, 77 + */ + trn1 v28.8h, v2.8h, v3.8h + trn1 v29.8h, v4.8h, v5.8h + trn1 v30.8h, v6.8h, v7.8h + trn1 v31.8h, v8.8h, v9.8h + trn2 v16.8h, v2.8h, v3.8h + trn2 v17.8h, v4.8h, v5.8h + trn2 v18.8h, v6.8h, v7.8h + trn2 v19.8h, v8.8h, v9.8h + trn1 v2.4s, v28.4s, v29.4s + trn1 v6.4s, v30.4s, v31.4s + trn1 v3.4s, v16.4s, v17.4s + trn1 v7.4s, v18.4s, v19.4s + trn2 v4.4s, v28.4s, v29.4s + trn2 v8.4s, v30.4s, v31.4s + trn2 v5.4s, v16.4s, v17.4s + trn2 v9.4s, v18.4s, v19.4s + /* Even part: reverse the even part of the forward DCT. */ + add v18.8h, v4.8h, v8.8h /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]) + DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]) */ + add v22.8h, v2.8h, v6.8h /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) + DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */ + smull2 v19.4s, v18.8h, XFIX_P_0_541 /* z1h z1 = MULTIPLY(z2 + z3, FIX_0_541196100); */ + sub v26.8h, v2.8h, v6.8h /* z2 - z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) - DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */ + smull v18.4s, v18.4h, XFIX_P_0_541 /* z1l z1 = MULTIPLY(z2 + z3, FIX_0_541196100); */ + sshll2 v23.4s, v22.8h, #(CONST_BITS) /* tmp0h tmp0 = LEFT_SHIFT(z2 + z3, CONST_BITS); */ + mov v21.16b, v19.16b /* tmp3 = z1 */ + mov v20.16b, v18.16b /* tmp3 = z1 */ + smlal2 v19.4s, v8.8h, XFIX_N_1_847 /* tmp2h tmp2 = z1 + MULTIPLY(z3, - FIX_1_847759065); */ + smlal v18.4s, v8.4h, XFIX_N_1_847 /* tmp2l tmp2 = z1 + MULTIPLY(z3, - FIX_1_847759065); */ + sshll2 v27.4s, v26.8h, #(CONST_BITS) /* tmp1h tmp1 = LEFT_SHIFT(z2 - z3, CONST_BITS); */ + smlal2 v21.4s, v4.8h, XFIX_P_0_765 /* tmp3h tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865); */ + smlal v20.4s, v4.4h, XFIX_P_0_765 /* tmp3l tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865); */ + sshll v22.4s, v22.4h, #(CONST_BITS) /* tmp0l tmp0 = LEFT_SHIFT(z2 + z3, CONST_BITS); */ + sshll v26.4s, v26.4h, #(CONST_BITS) /* tmp1l tmp1 = LEFT_SHIFT(z2 - z3, CONST_BITS); */ + add v2.4s, v22.4s, v20.4s /* tmp10l tmp10 = tmp0 + tmp3; */ + sub v6.4s, v22.4s, v20.4s /* tmp13l tmp13 = tmp0 - tmp3; */ + add v8.4s, v26.4s, v18.4s /* tmp11l tmp11 = tmp1 + tmp2; */ + sub v4.4s, v26.4s, v18.4s /* tmp12l tmp12 = tmp1 - tmp2; */ + add v28.4s, v23.4s, v21.4s /* tmp10h tmp10 = tmp0 + tmp3; */ + sub v31.4s, v23.4s, v21.4s /* tmp13h tmp13 = tmp0 - tmp3; */ + add v29.4s, v27.4s, v19.4s /* tmp11h tmp11 = tmp1 + tmp2; */ + sub v30.4s, v27.4s, v19.4s /* tmp12h tmp12 = tmp1 - tmp2; */ + + /* Odd part per figure 8; the matrix is unitary and hence its + * transpose is its inverse. i0..i3 are y7,y5,y3,y1 respectively. + */ + + add v22.8h, v9.8h, v5.8h /* z3 = tmp0 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */ + add v24.8h, v7.8h, v3.8h /* z4 = tmp1 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */ + add v18.8h, v9.8h, v3.8h /* z1 = tmp0 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */ + add v20.8h, v7.8h, v5.8h /* z2 = tmp1 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */ + add v26.8h, v22.8h, v24.8h /* z5 = z3 + z4 */ + + smull2 v11.4s, v9.8h, XFIX_P_0_298 /* tmp0 = MULTIPLY(tmp0, FIX_0_298631336) */ + smull2 v13.4s, v7.8h, XFIX_P_2_053 /* tmp1 = MULTIPLY(tmp1, FIX_2_053119869) */ + smull2 v15.4s, v5.8h, XFIX_P_3_072 /* tmp2 = MULTIPLY(tmp2, FIX_3_072711026) */ + smull2 v17.4s, v3.8h, XFIX_P_1_501 /* tmp3 = MULTIPLY(tmp3, FIX_1_501321110) */ + smull2 v27.4s, v26.8h, XFIX_P_1_175 /* z5h z5 = MULTIPLY(z3 + z4, FIX_1_175875602) */ + smull2 v23.4s, v22.8h, XFIX_N_1_961 /* z3 = MULTIPLY(z3, - FIX_1_961570560) */ + smull2 v25.4s, v24.8h, XFIX_N_0_390 /* z4 = MULTIPLY(z4, - FIX_0_390180644) */ + smull2 v19.4s, v18.8h, XFIX_N_0_899 /* z1 = MULTIPLY(z1, - FIX_0_899976223) */ + smull2 v21.4s, v20.8h, XFIX_N_2_562 /* z2 = MULTIPLY(z2, - FIX_2_562915447) */ + + smull v10.4s, v9.4h, XFIX_P_0_298 /* tmp0 = MULTIPLY(tmp0, FIX_0_298631336) */ + smull v12.4s, v7.4h, XFIX_P_2_053 /* tmp1 = MULTIPLY(tmp1, FIX_2_053119869) */ + smull v14.4s, v5.4h, XFIX_P_3_072 /* tmp2 = MULTIPLY(tmp2, FIX_3_072711026) */ + smull v16.4s, v3.4h, XFIX_P_1_501 /* tmp3 = MULTIPLY(tmp3, FIX_1_501321110) */ + smull v26.4s, v26.4h, XFIX_P_1_175 /* z5l z5 = MULTIPLY(z3 + z4, FIX_1_175875602) */ + smull v22.4s, v22.4h, XFIX_N_1_961 /* z3 = MULTIPLY(z3, - FIX_1_961570560) */ + smull v24.4s, v24.4h, XFIX_N_0_390 /* z4 = MULTIPLY(z4, - FIX_0_390180644) */ + smull v18.4s, v18.4h, XFIX_N_0_899 /* z1 = MULTIPLY(z1, - FIX_0_899976223) */ + smull v20.4s, v20.4h, XFIX_N_2_562 /* z2 = MULTIPLY(z2, - FIX_2_562915447) */ + + add v23.4s, v23.4s, v27.4s /* z3 += z5 */ + add v22.4s, v22.4s, v26.4s /* z3 += z5 */ + add v25.4s, v25.4s, v27.4s /* z4 += z5 */ + add v24.4s, v24.4s, v26.4s /* z4 += z5 */ + + add v11.4s, v11.4s, v19.4s /* tmp0 += z1 */ + add v10.4s, v10.4s, v18.4s /* tmp0 += z1 */ + add v13.4s, v13.4s, v21.4s /* tmp1 += z2 */ + add v12.4s, v12.4s, v20.4s /* tmp1 += z2 */ + add v15.4s, v15.4s, v21.4s /* tmp2 += z2 */ + add v14.4s, v14.4s, v20.4s /* tmp2 += z2 */ + add v17.4s, v17.4s, v19.4s /* tmp3 += z1 */ + add v16.4s, v16.4s, v18.4s /* tmp3 += z1 */ + + add v11.4s, v11.4s, v23.4s /* tmp0 += z3 */ + add v10.4s, v10.4s, v22.4s /* tmp0 += z3 */ + add v13.4s, v13.4s, v25.4s /* tmp1 += z4 */ + add v12.4s, v12.4s, v24.4s /* tmp1 += z4 */ + add v17.4s, v17.4s, v25.4s /* tmp3 += z4 */ + add v16.4s, v16.4s, v24.4s /* tmp3 += z4 */ + add v15.4s, v15.4s, v23.4s /* tmp2 += z3 */ + add v14.4s, v14.4s, v22.4s /* tmp2 += z3 */ + + /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */ + + add v18.4s, v2.4s, v16.4s /* tmp10 + tmp3 */ + add v19.4s, v28.4s, v17.4s /* tmp10 + tmp3 */ + sub v20.4s, v2.4s, v16.4s /* tmp10 - tmp3 */ + sub v21.4s, v28.4s, v17.4s /* tmp10 - tmp3 */ + add v22.4s, v8.4s, v14.4s /* tmp11 + tmp2 */ + add v23.4s, v29.4s, v15.4s /* tmp11 + tmp2 */ + sub v24.4s, v8.4s, v14.4s /* tmp11 - tmp2 */ + sub v25.4s, v29.4s, v15.4s /* tmp11 - tmp2 */ + add v26.4s, v4.4s, v12.4s /* tmp12 + tmp1 */ + add v27.4s, v30.4s, v13.4s /* tmp12 + tmp1 */ + sub v28.4s, v4.4s, v12.4s /* tmp12 - tmp1 */ + sub v29.4s, v30.4s, v13.4s /* tmp12 - tmp1 */ + add v14.4s, v6.4s, v10.4s /* tmp13 + tmp0 */ + add v15.4s, v31.4s, v11.4s /* tmp13 + tmp0 */ + sub v16.4s, v6.4s, v10.4s /* tmp13 - tmp0 */ + sub v17.4s, v31.4s, v11.4s /* tmp13 - tmp0 */ + + shrn v2.4h, v18.4s, #16 /* wsptr[DCTSIZE*0] = (int) DESCALE(tmp10 + tmp3, CONST_BITS+PASS1_BITS+3) */ + shrn v9.4h, v20.4s, #16 /* wsptr[DCTSIZE*7] = (int) DESCALE(tmp10 - tmp3, CONST_BITS+PASS1_BITS+3) */ + shrn v3.4h, v22.4s, #16 /* wsptr[DCTSIZE*1] = (int) DESCALE(tmp11 + tmp2, CONST_BITS+PASS1_BITS+3) */ + shrn v8.4h, v24.4s, #16 /* wsptr[DCTSIZE*6] = (int) DESCALE(tmp11 - tmp2, CONST_BITS+PASS1_BITS+3) */ + shrn v4.4h, v26.4s, #16 /* wsptr[DCTSIZE*2] = (int) DESCALE(tmp12 + tmp1, CONST_BITS+PASS1_BITS+3) */ + shrn v7.4h, v28.4s, #16 /* wsptr[DCTSIZE*5] = (int) DESCALE(tmp12 - tmp1, CONST_BITS+PASS1_BITS+3) */ + shrn v5.4h, v14.4s, #16 /* wsptr[DCTSIZE*3] = (int) DESCALE(tmp13 + tmp0, CONST_BITS+PASS1_BITS+3) */ + shrn v6.4h, v16.4s, #16 /* wsptr[DCTSIZE*4] = (int) DESCALE(tmp13 - tmp0, CONST_BITS+PASS1_BITS+3) */ + shrn2 v2.8h, v19.4s, #16 /* wsptr[DCTSIZE*0] = (int) DESCALE(tmp10 + tmp3, CONST_BITS+PASS1_BITS+3) */ + shrn2 v9.8h, v21.4s, #16 /* wsptr[DCTSIZE*7] = (int) DESCALE(tmp10 - tmp3, CONST_BITS+PASS1_BITS+3) */ + shrn2 v3.8h, v23.4s, #16 /* wsptr[DCTSIZE*1] = (int) DESCALE(tmp11 + tmp2, CONST_BITS+PASS1_BITS+3) */ + shrn2 v8.8h, v25.4s, #16 /* wsptr[DCTSIZE*6] = (int) DESCALE(tmp11 - tmp2, CONST_BITS+PASS1_BITS+3) */ + shrn2 v4.8h, v27.4s, #16 /* wsptr[DCTSIZE*2] = (int) DESCALE(tmp12 + tmp1, CONST_BITS+PASS1_BITS+3) */ + shrn2 v7.8h, v29.4s, #16 /* wsptr[DCTSIZE*5] = (int) DESCALE(tmp12 - tmp1, CONST_BITS+PASS1_BITS+3) */ + shrn2 v5.8h, v15.4s, #16 /* wsptr[DCTSIZE*3] = (int) DESCALE(tmp13 + tmp0, CONST_BITS+PASS1_BITS+3) */ + shrn2 v6.8h, v17.4s, #16 /* wsptr[DCTSIZE*4] = (int) DESCALE(tmp13 - tmp0, CONST_BITS+PASS1_BITS+3) */ + movi v0.16b, #(CENTERJSAMPLE) + /* Prepare pointers (dual-issue with NEON instructions) */ + ldp TMP1, TMP2, [OUTPUT_BUF], 16 + sqrshrn v28.8b, v2.8h, #(CONST_BITS+PASS1_BITS+3-16) + ldp TMP3, TMP4, [OUTPUT_BUF], 16 + sqrshrn v29.8b, v3.8h, #(CONST_BITS+PASS1_BITS+3-16) + add TMP1, TMP1, OUTPUT_COL + sqrshrn v30.8b, v4.8h, #(CONST_BITS+PASS1_BITS+3-16) + add TMP2, TMP2, OUTPUT_COL + sqrshrn v31.8b, v5.8h, #(CONST_BITS+PASS1_BITS+3-16) + add TMP3, TMP3, OUTPUT_COL + sqrshrn2 v28.16b, v6.8h, #(CONST_BITS+PASS1_BITS+3-16) + add TMP4, TMP4, OUTPUT_COL + sqrshrn2 v29.16b, v7.8h, #(CONST_BITS+PASS1_BITS+3-16) + ldp TMP5, TMP6, [OUTPUT_BUF], 16 + sqrshrn2 v30.16b, v8.8h, #(CONST_BITS+PASS1_BITS+3-16) + ldp TMP7, TMP8, [OUTPUT_BUF], 16 + sqrshrn2 v31.16b, v9.8h, #(CONST_BITS+PASS1_BITS+3-16) + add TMP5, TMP5, OUTPUT_COL + add v16.16b, v28.16b, v0.16b + add TMP6, TMP6, OUTPUT_COL + add v18.16b, v29.16b, v0.16b + add TMP7, TMP7, OUTPUT_COL + add v20.16b, v30.16b, v0.16b + add TMP8, TMP8, OUTPUT_COL + add v22.16b, v31.16b, v0.16b + + /* Transpose the final 8-bit samples */ + trn1 v28.16b, v16.16b, v18.16b + trn1 v30.16b, v20.16b, v22.16b + trn2 v29.16b, v16.16b, v18.16b + trn2 v31.16b, v20.16b, v22.16b + + trn1 v16.8h, v28.8h, v30.8h + trn2 v18.8h, v28.8h, v30.8h + trn1 v20.8h, v29.8h, v31.8h + trn2 v22.8h, v29.8h, v31.8h + + uzp1 v28.4s, v16.4s, v18.4s + uzp2 v30.4s, v16.4s, v18.4s + uzp1 v29.4s, v20.4s, v22.4s + uzp2 v31.4s, v20.4s, v22.4s + + /* Store results to the output buffer */ + st1 {v28.d}[0], [TMP1] + st1 {v29.d}[0], [TMP2] + st1 {v28.d}[1], [TMP3] + st1 {v29.d}[1], [TMP4] + st1 {v30.d}[0], [TMP5] + st1 {v31.d}[0], [TMP6] + st1 {v30.d}[1], [TMP7] + st1 {v31.d}[1], [TMP8] + ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], #32 + ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], #32 + blr x30 + +.balign 16 +2: + mul v3.8h, v3.8h, v19.8h + mul v4.8h, v4.8h, v20.8h + mul v5.8h, v5.8h, v21.8h + add TMP4, xzr, TMP2, LSL #32 + mul v6.8h, v6.8h, v22.8h + mul v7.8h, v7.8h, v23.8h + adds TMP3, xzr, TMP2, LSR #32 + mul v8.8h, v8.8h, v24.8h + mul v9.8h, v9.8h, v25.8h + b.ne 3f + /* Right AC coef is zero */ + dup v15.2d, v10.d[1] + /* Even part: reverse the even part of the forward DCT. */ + add v18.4h, v4.4h, v8.4h /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]) + DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]) */ + add v22.4h, v2.4h, v6.4h /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) + DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */ + sub v26.4h, v2.4h, v6.4h /* z2 - z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) - DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */ + smull v18.4s, v18.4h, XFIX_P_0_541 /* z1l z1 = MULTIPLY(z2 + z3, FIX_0_541196100); */ + sshll v22.4s, v22.4h, #(CONST_BITS) /* tmp0l tmp0 = LEFT_SHIFT(z2 + z3, CONST_BITS); */ + mov v20.16b, v18.16b /* tmp3 = z1 */ + sshll v26.4s, v26.4h, #(CONST_BITS) /* tmp1l tmp1 = LEFT_SHIFT(z2 - z3, CONST_BITS); */ + smlal v18.4s, v8.4h, XFIX_N_1_847 /* tmp2l tmp2 = z1 + MULTIPLY(z3, - FIX_1_847759065); */ + smlal v20.4s, v4.4h, XFIX_P_0_765 /* tmp3l tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865); */ + add v2.4s, v22.4s, v20.4s /* tmp10l tmp10 = tmp0 + tmp3; */ + sub v6.4s, v22.4s, v20.4s /* tmp13l tmp13 = tmp0 - tmp3; */ + add v8.4s, v26.4s, v18.4s /* tmp11l tmp11 = tmp1 + tmp2; */ + sub v4.4s, v26.4s, v18.4s /* tmp12l tmp12 = tmp1 - tmp2; */ + + /* Odd part per figure 8; the matrix is unitary and hence its + * transpose is its inverse. i0..i3 are y7,y5,y3,y1 respectively. + */ + + add v22.4h, v9.4h, v5.4h /* z3 = tmp0 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */ + add v24.4h, v7.4h, v3.4h /* z4 = tmp1 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */ + add v18.4h, v9.4h, v3.4h /* z1 = tmp0 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */ + add v20.4h, v7.4h, v5.4h /* z2 = tmp1 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */ + add v26.4h, v22.4h, v24.4h /* z5 = z3 + z4 */ + + smull v10.4s, v9.4h, XFIX_P_0_298 /* tmp0 = MULTIPLY(tmp0, FIX_0_298631336) */ + smull v12.4s, v7.4h, XFIX_P_2_053 /* tmp1 = MULTIPLY(tmp1, FIX_2_053119869) */ + smull v14.4s, v5.4h, XFIX_P_3_072 /* tmp2 = MULTIPLY(tmp2, FIX_3_072711026) */ + smull v16.4s, v3.4h, XFIX_P_1_501 /* tmp3 = MULTIPLY(tmp3, FIX_1_501321110) */ + smull v26.4s, v26.4h, XFIX_P_1_175 /* z5l z5 = MULTIPLY(z3 + z4, FIX_1_175875602) */ + smull v22.4s, v22.4h, XFIX_N_1_961 /* z3 = MULTIPLY(z3, - FIX_1_961570560) */ + smull v24.4s, v24.4h, XFIX_N_0_390 /* z4 = MULTIPLY(z4, - FIX_0_390180644) */ + smull v18.4s, v18.4h, XFIX_N_0_899 /* z1 = MULTIPLY(z1, - FIX_0_899976223) */ + smull v20.4s, v20.4h, XFIX_N_2_562 /* z2 = MULTIPLY(z2, - FIX_2_562915447) */ + + add v22.4s, v22.4s, v26.4s /* z3 += z5 */ + add v24.4s, v24.4s, v26.4s /* z4 += z5 */ + + add v10.4s, v10.4s, v18.4s /* tmp0 += z1 */ + add v12.4s, v12.4s, v20.4s /* tmp1 += z2 */ + add v14.4s, v14.4s, v20.4s /* tmp2 += z2 */ + add v16.4s, v16.4s, v18.4s /* tmp3 += z1 */ + + add v10.4s, v10.4s, v22.4s /* tmp0 += z3 */ + add v12.4s, v12.4s, v24.4s /* tmp1 += z4 */ + add v16.4s, v16.4s, v24.4s /* tmp3 += z4 */ + add v14.4s, v14.4s, v22.4s /* tmp2 += z3 */ + + /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */ + + add v18.4s, v2.4s, v16.4s /* tmp10 + tmp3 */ + sub v20.4s, v2.4s, v16.4s /* tmp10 - tmp3 */ + add v22.4s, v8.4s, v14.4s /* tmp11 + tmp2 */ + sub v24.4s, v8.4s, v14.4s /* tmp11 - tmp2 */ + add v26.4s, v4.4s, v12.4s /* tmp12 + tmp1 */ + sub v28.4s, v4.4s, v12.4s /* tmp12 - tmp1 */ + add v14.4s, v6.4s, v10.4s /* tmp13 + tmp0 */ + sub v16.4s, v6.4s, v10.4s /* tmp13 - tmp0 */ + + rshrn v2.4h, v18.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*0] = (int) DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS) */ + rshrn v3.4h, v22.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*1] = (int) DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS) */ + rshrn v4.4h, v26.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*2] = (int) DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS) */ + rshrn v5.4h, v14.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*3] = (int) DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS) */ + rshrn2 v2.8h, v16.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*4] = (int) DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS) */ + rshrn2 v3.8h, v28.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*5] = (int) DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS) */ + rshrn2 v4.8h, v24.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*6] = (int) DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS) */ + rshrn2 v5.8h, v20.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*7] = (int) DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS) */ + mov v6.16b, v15.16b + mov v7.16b, v15.16b + mov v8.16b, v15.16b + mov v9.16b, v15.16b + b 1b + +.balign 16 +3: + cbnz TMP4, 4f + /* Left AC coef is zero */ + dup v14.2d, v10.d[0] + /* Even part: reverse the even part of the forward DCT. */ + add v18.8h, v4.8h, v8.8h /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]) + DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]) */ + add v22.8h, v2.8h, v6.8h /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) + DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */ + smull2 v19.4s, v18.8h, XFIX_P_0_541 /* z1h z1 = MULTIPLY(z2 + z3, FIX_0_541196100); */ + sub v26.8h, v2.8h, v6.8h /* z2 - z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) - DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */ + sshll2 v23.4s, v22.8h, #(CONST_BITS) /* tmp0h tmp0 = LEFT_SHIFT(z2 + z3, CONST_BITS); */ + mov v21.16b, v19.16b /* tmp3 = z1 */ + smlal2 v19.4s, v8.8h, XFIX_N_1_847 /* tmp2h tmp2 = z1 + MULTIPLY(z3, - FIX_1_847759065); */ + sshll2 v27.4s, v26.8h, #(CONST_BITS) /* tmp1h tmp1 = LEFT_SHIFT(z2 - z3, CONST_BITS); */ + smlal2 v21.4s, v4.8h, XFIX_P_0_765 /* tmp3h tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865); */ + add v28.4s, v23.4s, v21.4s /* tmp10h tmp10 = tmp0 + tmp3; */ + sub v31.4s, v23.4s, v21.4s /* tmp13h tmp13 = tmp0 - tmp3; */ + add v29.4s, v27.4s, v19.4s /* tmp11h tmp11 = tmp1 + tmp2; */ + sub v30.4s, v27.4s, v19.4s /* tmp12h tmp12 = tmp1 - tmp2; */ + + /* Odd part per figure 8; the matrix is unitary and hence its + * transpose is its inverse. i0..i3 are y7,y5,y3,y1 respectively. + */ + + add v22.8h, v9.8h, v5.8h /* z3 = tmp0 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */ + add v24.8h, v7.8h, v3.8h /* z4 = tmp1 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */ + add v18.8h, v9.8h, v3.8h /* z1 = tmp0 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */ + add v20.8h, v7.8h, v5.8h /* z2 = tmp1 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */ + add v26.8h, v22.8h, v24.8h /* z5 = z3 + z4 */ + + smull2 v11.4s, v9.8h, XFIX_P_0_298 /* tmp0 = MULTIPLY(tmp0, FIX_0_298631336) */ + smull2 v13.4s, v7.8h, XFIX_P_2_053 /* tmp1 = MULTIPLY(tmp1, FIX_2_053119869) */ + smull2 v15.4s, v5.8h, XFIX_P_3_072 /* tmp2 = MULTIPLY(tmp2, FIX_3_072711026) */ + smull2 v17.4s, v3.8h, XFIX_P_1_501 /* tmp3 = MULTIPLY(tmp3, FIX_1_501321110) */ + smull2 v27.4s, v26.8h, XFIX_P_1_175 /* z5h z5 = MULTIPLY(z3 + z4, FIX_1_175875602) */ + smull2 v23.4s, v22.8h, XFIX_N_1_961 /* z3 = MULTIPLY(z3, - FIX_1_961570560) */ + smull2 v25.4s, v24.8h, XFIX_N_0_390 /* z4 = MULTIPLY(z4, - FIX_0_390180644) */ + smull2 v19.4s, v18.8h, XFIX_N_0_899 /* z1 = MULTIPLY(z1, - FIX_0_899976223) */ + smull2 v21.4s, v20.8h, XFIX_N_2_562 /* z2 = MULTIPLY(z2, - FIX_2_562915447) */ + + add v23.4s, v23.4s, v27.4s /* z3 += z5 */ + add v22.4s, v22.4s, v26.4s /* z3 += z5 */ + add v25.4s, v25.4s, v27.4s /* z4 += z5 */ + add v24.4s, v24.4s, v26.4s /* z4 += z5 */ + + add v11.4s, v11.4s, v19.4s /* tmp0 += z1 */ + add v13.4s, v13.4s, v21.4s /* tmp1 += z2 */ + add v15.4s, v15.4s, v21.4s /* tmp2 += z2 */ + add v17.4s, v17.4s, v19.4s /* tmp3 += z1 */ + + add v11.4s, v11.4s, v23.4s /* tmp0 += z3 */ + add v13.4s, v13.4s, v25.4s /* tmp1 += z4 */ + add v17.4s, v17.4s, v25.4s /* tmp3 += z4 */ + add v15.4s, v15.4s, v23.4s /* tmp2 += z3 */ + + /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */ + + add v19.4s, v28.4s, v17.4s /* tmp10 + tmp3 */ + sub v21.4s, v28.4s, v17.4s /* tmp10 - tmp3 */ + add v23.4s, v29.4s, v15.4s /* tmp11 + tmp2 */ + sub v25.4s, v29.4s, v15.4s /* tmp11 - tmp2 */ + add v27.4s, v30.4s, v13.4s /* tmp12 + tmp1 */ + sub v29.4s, v30.4s, v13.4s /* tmp12 - tmp1 */ + add v15.4s, v31.4s, v11.4s /* tmp13 + tmp0 */ + sub v17.4s, v31.4s, v11.4s /* tmp13 - tmp0 */ + + mov v2.16b, v14.16b + mov v3.16b, v14.16b + mov v4.16b, v14.16b + mov v5.16b, v14.16b + rshrn v6.4h, v19.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*0] = (int) DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS) */ + rshrn v7.4h, v23.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*1] = (int) DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS) */ + rshrn v8.4h, v27.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*2] = (int) DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS) */ + rshrn v9.4h, v15.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*3] = (int) DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS) */ + rshrn2 v6.8h, v17.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*4] = (int) DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS) */ + rshrn2 v7.8h, v29.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*5] = (int) DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS) */ + rshrn2 v8.8h, v25.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*6] = (int) DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS) */ + rshrn2 v9.8h, v21.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*7] = (int) DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS) */ + b 1b + +.balign 16 +4: + /* "No" AC coef is zero */ + /* Even part: reverse the even part of the forward DCT. */ + add v18.8h, v4.8h, v8.8h /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]) + DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]) */ + add v22.8h, v2.8h, v6.8h /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) + DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */ + smull2 v19.4s, v18.8h, XFIX_P_0_541 /* z1h z1 = MULTIPLY(z2 + z3, FIX_0_541196100); */ + sub v26.8h, v2.8h, v6.8h /* z2 - z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) - DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */ + smull v18.4s, v18.4h, XFIX_P_0_541 /* z1l z1 = MULTIPLY(z2 + z3, FIX_0_541196100); */ + sshll2 v23.4s, v22.8h, #(CONST_BITS) /* tmp0h tmp0 = LEFT_SHIFT(z2 + z3, CONST_BITS); */ + mov v21.16b, v19.16b /* tmp3 = z1 */ + mov v20.16b, v18.16b /* tmp3 = z1 */ + smlal2 v19.4s, v8.8h, XFIX_N_1_847 /* tmp2h tmp2 = z1 + MULTIPLY(z3, - FIX_1_847759065); */ + smlal v18.4s, v8.4h, XFIX_N_1_847 /* tmp2l tmp2 = z1 + MULTIPLY(z3, - FIX_1_847759065); */ + sshll2 v27.4s, v26.8h, #(CONST_BITS) /* tmp1h tmp1 = LEFT_SHIFT(z2 - z3, CONST_BITS); */ + smlal2 v21.4s, v4.8h, XFIX_P_0_765 /* tmp3h tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865); */ + smlal v20.4s, v4.4h, XFIX_P_0_765 /* tmp3l tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865); */ + sshll v22.4s, v22.4h, #(CONST_BITS) /* tmp0l tmp0 = LEFT_SHIFT(z2 + z3, CONST_BITS); */ + sshll v26.4s, v26.4h, #(CONST_BITS) /* tmp1l tmp1 = LEFT_SHIFT(z2 - z3, CONST_BITS); */ + add v2.4s, v22.4s, v20.4s /* tmp10l tmp10 = tmp0 + tmp3; */ + sub v6.4s, v22.4s, v20.4s /* tmp13l tmp13 = tmp0 - tmp3; */ + add v8.4s, v26.4s, v18.4s /* tmp11l tmp11 = tmp1 + tmp2; */ + sub v4.4s, v26.4s, v18.4s /* tmp12l tmp12 = tmp1 - tmp2; */ + add v28.4s, v23.4s, v21.4s /* tmp10h tmp10 = tmp0 + tmp3; */ + sub v31.4s, v23.4s, v21.4s /* tmp13h tmp13 = tmp0 - tmp3; */ + add v29.4s, v27.4s, v19.4s /* tmp11h tmp11 = tmp1 + tmp2; */ + sub v30.4s, v27.4s, v19.4s /* tmp12h tmp12 = tmp1 - tmp2; */ + + /* Odd part per figure 8; the matrix is unitary and hence its + * transpose is its inverse. i0..i3 are y7,y5,y3,y1 respectively. + */ + + add v22.8h, v9.8h, v5.8h /* z3 = tmp0 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */ + add v24.8h, v7.8h, v3.8h /* z4 = tmp1 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */ + add v18.8h, v9.8h, v3.8h /* z1 = tmp0 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */ + add v20.8h, v7.8h, v5.8h /* z2 = tmp1 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */ + add v26.8h, v22.8h, v24.8h /* z5 = z3 + z4 */ + + smull2 v11.4s, v9.8h, XFIX_P_0_298 /* tmp0 = MULTIPLY(tmp0, FIX_0_298631336) */ + smull2 v13.4s, v7.8h, XFIX_P_2_053 /* tmp1 = MULTIPLY(tmp1, FIX_2_053119869) */ + smull2 v15.4s, v5.8h, XFIX_P_3_072 /* tmp2 = MULTIPLY(tmp2, FIX_3_072711026) */ + smull2 v17.4s, v3.8h, XFIX_P_1_501 /* tmp3 = MULTIPLY(tmp3, FIX_1_501321110) */ + smull2 v27.4s, v26.8h, XFIX_P_1_175 /* z5h z5 = MULTIPLY(z3 + z4, FIX_1_175875602) */ + smull2 v23.4s, v22.8h, XFIX_N_1_961 /* z3 = MULTIPLY(z3, - FIX_1_961570560) */ + smull2 v25.4s, v24.8h, XFIX_N_0_390 /* z4 = MULTIPLY(z4, - FIX_0_390180644) */ + smull2 v19.4s, v18.8h, XFIX_N_0_899 /* z1 = MULTIPLY(z1, - FIX_0_899976223) */ + smull2 v21.4s, v20.8h, XFIX_N_2_562 /* z2 = MULTIPLY(z2, - FIX_2_562915447) */ + + smull v10.4s, v9.4h, XFIX_P_0_298 /* tmp0 = MULTIPLY(tmp0, FIX_0_298631336) */ + smull v12.4s, v7.4h, XFIX_P_2_053 /* tmp1 = MULTIPLY(tmp1, FIX_2_053119869) */ + smull v14.4s, v5.4h, XFIX_P_3_072 /* tmp2 = MULTIPLY(tmp2, FIX_3_072711026) */ + smull v16.4s, v3.4h, XFIX_P_1_501 /* tmp3 = MULTIPLY(tmp3, FIX_1_501321110) */ + smull v26.4s, v26.4h, XFIX_P_1_175 /* z5l z5 = MULTIPLY(z3 + z4, FIX_1_175875602) */ + smull v22.4s, v22.4h, XFIX_N_1_961 /* z3 = MULTIPLY(z3, - FIX_1_961570560) */ + smull v24.4s, v24.4h, XFIX_N_0_390 /* z4 = MULTIPLY(z4, - FIX_0_390180644) */ + smull v18.4s, v18.4h, XFIX_N_0_899 /* z1 = MULTIPLY(z1, - FIX_0_899976223) */ + smull v20.4s, v20.4h, XFIX_N_2_562 /* z2 = MULTIPLY(z2, - FIX_2_562915447) */ + + add v23.4s, v23.4s, v27.4s /* z3 += z5 */ + add v22.4s, v22.4s, v26.4s /* z3 += z5 */ + add v25.4s, v25.4s, v27.4s /* z4 += z5 */ + add v24.4s, v24.4s, v26.4s /* z4 += z5 */ + + add v11.4s, v11.4s, v19.4s /* tmp0 += z1 */ + add v10.4s, v10.4s, v18.4s /* tmp0 += z1 */ + add v13.4s, v13.4s, v21.4s /* tmp1 += z2 */ + add v12.4s, v12.4s, v20.4s /* tmp1 += z2 */ + add v15.4s, v15.4s, v21.4s /* tmp2 += z2 */ + add v14.4s, v14.4s, v20.4s /* tmp2 += z2 */ + add v17.4s, v17.4s, v19.4s /* tmp3 += z1 */ + add v16.4s, v16.4s, v18.4s /* tmp3 += z1 */ + + add v11.4s, v11.4s, v23.4s /* tmp0 += z3 */ + add v10.4s, v10.4s, v22.4s /* tmp0 += z3 */ + add v13.4s, v13.4s, v25.4s /* tmp1 += z4 */ + add v12.4s, v12.4s, v24.4s /* tmp1 += z4 */ + add v17.4s, v17.4s, v25.4s /* tmp3 += z4 */ + add v16.4s, v16.4s, v24.4s /* tmp3 += z4 */ + add v15.4s, v15.4s, v23.4s /* tmp2 += z3 */ + add v14.4s, v14.4s, v22.4s /* tmp2 += z3 */ + + /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */ + + add v18.4s, v2.4s, v16.4s /* tmp10 + tmp3 */ + add v19.4s, v28.4s, v17.4s /* tmp10 + tmp3 */ + sub v20.4s, v2.4s, v16.4s /* tmp10 - tmp3 */ + sub v21.4s, v28.4s, v17.4s /* tmp10 - tmp3 */ + add v22.4s, v8.4s, v14.4s /* tmp11 + tmp2 */ + add v23.4s, v29.4s, v15.4s /* tmp11 + tmp2 */ + sub v24.4s, v8.4s, v14.4s /* tmp11 - tmp2 */ + sub v25.4s, v29.4s, v15.4s /* tmp11 - tmp2 */ + add v26.4s, v4.4s, v12.4s /* tmp12 + tmp1 */ + add v27.4s, v30.4s, v13.4s /* tmp12 + tmp1 */ + sub v28.4s, v4.4s, v12.4s /* tmp12 - tmp1 */ + sub v29.4s, v30.4s, v13.4s /* tmp12 - tmp1 */ + add v14.4s, v6.4s, v10.4s /* tmp13 + tmp0 */ + add v15.4s, v31.4s, v11.4s /* tmp13 + tmp0 */ + sub v16.4s, v6.4s, v10.4s /* tmp13 - tmp0 */ + sub v17.4s, v31.4s, v11.4s /* tmp13 - tmp0 */ + + rshrn v2.4h, v18.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*0] = (int) DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS) */ + rshrn v3.4h, v22.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*1] = (int) DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS) */ + rshrn v4.4h, v26.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*2] = (int) DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS) */ + rshrn v5.4h, v14.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*3] = (int) DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS) */ + rshrn v6.4h, v19.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*0] = (int) DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS) */ + rshrn v7.4h, v23.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*1] = (int) DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS) */ + rshrn v8.4h, v27.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*2] = (int) DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS) */ + rshrn v9.4h, v15.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*3] = (int) DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS) */ + rshrn2 v2.8h, v16.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*4] = (int) DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS) */ + rshrn2 v3.8h, v28.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*5] = (int) DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS) */ + rshrn2 v4.8h, v24.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*6] = (int) DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS) */ + rshrn2 v5.8h, v20.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*7] = (int) DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS) */ + rshrn2 v6.8h, v17.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*4] = (int) DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS) */ + rshrn2 v7.8h, v29.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*5] = (int) DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS) */ + rshrn2 v8.8h, v25.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*6] = (int) DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS) */ + rshrn2 v9.8h, v21.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*7] = (int) DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS) */ + b 1b + + .unreq DCT_TABLE + .unreq COEF_BLOCK + .unreq OUTPUT_BUF + .unreq OUTPUT_COL + .unreq TMP1 + .unreq TMP2 + .unreq TMP3 + .unreq TMP4 + .unreq TMP5 + .unreq TMP6 + .unreq TMP7 + .unreq TMP8 + +#undef CENTERJSAMPLE +#undef CONST_BITS +#undef PASS1_BITS +#undef XFIX_P_0_298 +#undef XFIX_N_0_390 +#undef XFIX_P_0_541 +#undef XFIX_P_0_765 +#undef XFIX_N_0_899 +#undef XFIX_P_1_175 +#undef XFIX_P_1_501 +#undef XFIX_N_1_847 +#undef XFIX_N_1_961 +#undef XFIX_P_2_053 +#undef XFIX_N_2_562 +#undef XFIX_P_3_072 + + +/*****************************************************************************/ + +/* + * jsimd_idct_ifast_neon + * + * This function contains a fast, not so accurate integer implementation of + * the inverse DCT (Discrete Cosine Transform). It uses the same calculations + * and produces exactly the same output as IJG's original 'jpeg_idct_ifast' + * function from jidctfst.c + * + * Normally 1-D AAN DCT needs 5 multiplications and 29 additions. + * But in ARM NEON case some extra additions are required because VQDMULH + * instruction can't handle the constants larger than 1. So the expressions + * like "x * 1.082392200" have to be converted to "x * 0.082392200 + x", + * which introduces an extra addition. Overall, there are 6 extra additions + * per 1-D IDCT pass, totalling to 5 VQDMULH and 35 VADD/VSUB instructions. + */ + +#define XFIX_1_082392200 v0.h[0] +#define XFIX_1_414213562 v0.h[1] +#define XFIX_1_847759065 v0.h[2] +#define XFIX_2_613125930 v0.h[3] + +.balign 16 +Ljsimd_idct_ifast_neon_consts: + .short (277 * 128 - 256 * 128) /* XFIX_1_082392200 */ + .short (362 * 128 - 256 * 128) /* XFIX_1_414213562 */ + .short (473 * 128 - 256 * 128) /* XFIX_1_847759065 */ + .short (669 * 128 - 512 * 128) /* XFIX_2_613125930 */ + +asm_function jsimd_idct_ifast_neon + + DCT_TABLE .req x0 + COEF_BLOCK .req x1 + OUTPUT_BUF .req x2 + OUTPUT_COL .req x3 + TMP1 .req x0 + TMP2 .req x1 + TMP3 .req x9 + TMP4 .req x10 + TMP5 .req x11 + TMP6 .req x12 + TMP7 .req x13 + TMP8 .req x14 + + /* OUTPUT_COL is a JDIMENSION (unsigned int) argument, so the ABI doesn't + guarantee that the upper (unused) 32 bits of x3 are valid. This + instruction ensures that those bits are set to zero. */ + uxtw x3, w3 + + /* Load and dequantize coefficients into NEON registers + * with the following allocation: + * 0 1 2 3 | 4 5 6 7 + * ---------+-------- + * 0 | d16 | d17 ( v16.8h ) + * 1 | d18 | d19 ( v17.8h ) + * 2 | d20 | d21 ( v18.8h ) + * 3 | d22 | d23 ( v19.8h ) + * 4 | d24 | d25 ( v20.8h ) + * 5 | d26 | d27 ( v21.8h ) + * 6 | d28 | d29 ( v22.8h ) + * 7 | d30 | d31 ( v23.8h ) + */ + /* Save NEON registers used in fast IDCT */ + adr TMP5, Ljsimd_idct_ifast_neon_consts + ld1 {v16.8h, v17.8h}, [COEF_BLOCK], 32 + ld1 {v0.8h, v1.8h}, [DCT_TABLE], 32 + ld1 {v18.8h, v19.8h}, [COEF_BLOCK], 32 + mul v16.8h, v16.8h, v0.8h + ld1 {v2.8h, v3.8h}, [DCT_TABLE], 32 + mul v17.8h, v17.8h, v1.8h + ld1 {v20.8h, v21.8h}, [COEF_BLOCK], 32 + mul v18.8h, v18.8h, v2.8h + ld1 {v0.8h, v1.8h}, [DCT_TABLE], 32 + mul v19.8h, v19.8h, v3.8h + ld1 {v22.8h, v23.8h}, [COEF_BLOCK], 32 + mul v20.8h, v20.8h, v0.8h + ld1 {v2.8h, v3.8h}, [DCT_TABLE], 32 + mul v22.8h, v22.8h, v2.8h + mul v21.8h, v21.8h, v1.8h + ld1 {v0.4h}, [TMP5] /* load constants */ + mul v23.8h, v23.8h, v3.8h + + /* 1-D IDCT, pass 1 */ + sub v2.8h, v18.8h, v22.8h + add v22.8h, v18.8h, v22.8h + sub v1.8h, v19.8h, v21.8h + add v21.8h, v19.8h, v21.8h + sub v5.8h, v17.8h, v23.8h + add v23.8h, v17.8h, v23.8h + sqdmulh v4.8h, v2.8h, XFIX_1_414213562 + sqdmulh v6.8h, v1.8h, XFIX_2_613125930 + add v3.8h, v1.8h, v1.8h + sub v1.8h, v5.8h, v1.8h + add v18.8h, v2.8h, v4.8h + sqdmulh v4.8h, v1.8h, XFIX_1_847759065 + sub v2.8h, v23.8h, v21.8h + add v3.8h, v3.8h, v6.8h + sqdmulh v6.8h, v2.8h, XFIX_1_414213562 + add v1.8h, v1.8h, v4.8h + sqdmulh v4.8h, v5.8h, XFIX_1_082392200 + sub v18.8h, v18.8h, v22.8h + add v2.8h, v2.8h, v6.8h + sub v6.8h, v16.8h, v20.8h + add v20.8h, v16.8h, v20.8h + add v17.8h, v5.8h, v4.8h + add v5.8h, v6.8h, v18.8h + sub v18.8h, v6.8h, v18.8h + add v6.8h, v23.8h, v21.8h + add v16.8h, v20.8h, v22.8h + sub v3.8h, v6.8h, v3.8h + sub v20.8h, v20.8h, v22.8h + sub v3.8h, v3.8h, v1.8h + sub v1.8h, v17.8h, v1.8h + add v2.8h, v3.8h, v2.8h + sub v23.8h, v16.8h, v6.8h + add v1.8h, v1.8h, v2.8h + add v16.8h, v16.8h, v6.8h + add v22.8h, v5.8h, v3.8h + sub v17.8h, v5.8h, v3.8h + sub v21.8h, v18.8h, v2.8h + add v18.8h, v18.8h, v2.8h + sub v19.8h, v20.8h, v1.8h + add v20.8h, v20.8h, v1.8h + transpose_8x8 v16, v17, v18, v19, v20, v21, v22, v23, v28, v29, v30, v31 + /* 1-D IDCT, pass 2 */ + sub v2.8h, v18.8h, v22.8h + add v22.8h, v18.8h, v22.8h + sub v1.8h, v19.8h, v21.8h + add v21.8h, v19.8h, v21.8h + sub v5.8h, v17.8h, v23.8h + add v23.8h, v17.8h, v23.8h + sqdmulh v4.8h, v2.8h, XFIX_1_414213562 + sqdmulh v6.8h, v1.8h, XFIX_2_613125930 + add v3.8h, v1.8h, v1.8h + sub v1.8h, v5.8h, v1.8h + add v18.8h, v2.8h, v4.8h + sqdmulh v4.8h, v1.8h, XFIX_1_847759065 + sub v2.8h, v23.8h, v21.8h + add v3.8h, v3.8h, v6.8h + sqdmulh v6.8h, v2.8h, XFIX_1_414213562 + add v1.8h, v1.8h, v4.8h + sqdmulh v4.8h, v5.8h, XFIX_1_082392200 + sub v18.8h, v18.8h, v22.8h + add v2.8h, v2.8h, v6.8h + sub v6.8h, v16.8h, v20.8h + add v20.8h, v16.8h, v20.8h + add v17.8h, v5.8h, v4.8h + add v5.8h, v6.8h, v18.8h + sub v18.8h, v6.8h, v18.8h + add v6.8h, v23.8h, v21.8h + add v16.8h, v20.8h, v22.8h + sub v3.8h, v6.8h, v3.8h + sub v20.8h, v20.8h, v22.8h + sub v3.8h, v3.8h, v1.8h + sub v1.8h, v17.8h, v1.8h + add v2.8h, v3.8h, v2.8h + sub v23.8h, v16.8h, v6.8h + add v1.8h, v1.8h, v2.8h + add v16.8h, v16.8h, v6.8h + add v22.8h, v5.8h, v3.8h + sub v17.8h, v5.8h, v3.8h + sub v21.8h, v18.8h, v2.8h + add v18.8h, v18.8h, v2.8h + sub v19.8h, v20.8h, v1.8h + add v20.8h, v20.8h, v1.8h + /* Descale to 8-bit and range limit */ + movi v0.16b, #0x80 + /* Prepare pointers (dual-issue with NEON instructions) */ + ldp TMP1, TMP2, [OUTPUT_BUF], 16 + sqshrn v28.8b, v16.8h, #5 + ldp TMP3, TMP4, [OUTPUT_BUF], 16 + sqshrn v29.8b, v17.8h, #5 + add TMP1, TMP1, OUTPUT_COL + sqshrn v30.8b, v18.8h, #5 + add TMP2, TMP2, OUTPUT_COL + sqshrn v31.8b, v19.8h, #5 + add TMP3, TMP3, OUTPUT_COL + sqshrn2 v28.16b, v20.8h, #5 + add TMP4, TMP4, OUTPUT_COL + sqshrn2 v29.16b, v21.8h, #5 + ldp TMP5, TMP6, [OUTPUT_BUF], 16 + sqshrn2 v30.16b, v22.8h, #5 + ldp TMP7, TMP8, [OUTPUT_BUF], 16 + sqshrn2 v31.16b, v23.8h, #5 + add TMP5, TMP5, OUTPUT_COL + add v16.16b, v28.16b, v0.16b + add TMP6, TMP6, OUTPUT_COL + add v18.16b, v29.16b, v0.16b + add TMP7, TMP7, OUTPUT_COL + add v20.16b, v30.16b, v0.16b + add TMP8, TMP8, OUTPUT_COL + add v22.16b, v31.16b, v0.16b + + /* Transpose the final 8-bit samples */ + trn1 v28.16b, v16.16b, v18.16b + trn1 v30.16b, v20.16b, v22.16b + trn2 v29.16b, v16.16b, v18.16b + trn2 v31.16b, v20.16b, v22.16b + + trn1 v16.8h, v28.8h, v30.8h + trn2 v18.8h, v28.8h, v30.8h + trn1 v20.8h, v29.8h, v31.8h + trn2 v22.8h, v29.8h, v31.8h + + uzp1 v28.4s, v16.4s, v18.4s + uzp2 v30.4s, v16.4s, v18.4s + uzp1 v29.4s, v20.4s, v22.4s + uzp2 v31.4s, v20.4s, v22.4s + + /* Store results to the output buffer */ + st1 {v28.d}[0], [TMP1] + st1 {v29.d}[0], [TMP2] + st1 {v28.d}[1], [TMP3] + st1 {v29.d}[1], [TMP4] + st1 {v30.d}[0], [TMP5] + st1 {v31.d}[0], [TMP6] + st1 {v30.d}[1], [TMP7] + st1 {v31.d}[1], [TMP8] + blr x30 + + .unreq DCT_TABLE + .unreq COEF_BLOCK + .unreq OUTPUT_BUF + .unreq OUTPUT_COL + .unreq TMP1 + .unreq TMP2 + .unreq TMP3 + .unreq TMP4 + .unreq TMP5 + .unreq TMP6 + .unreq TMP7 + .unreq TMP8 + + +/*****************************************************************************/ + +/* + * jsimd_idct_4x4_neon + * + * This function contains inverse-DCT code for getting reduced-size + * 4x4 pixels output from an 8x8 DCT block. It uses the same calculations + * and produces exactly the same output as IJG's original 'jpeg_idct_4x4' + * function from jpeg-6b (jidctred.c). + * + * NOTE: jpeg-8 has an improved implementation of 4x4 inverse-DCT, which + * requires much less arithmetic operations and hence should be faster. + * The primary purpose of this particular NEON optimized function is + * bit exact compatibility with jpeg-6b. + * + * TODO: a bit better instructions scheduling can be achieved by expanding + * idct_helper/transpose_4x4 macros and reordering instructions, + * but readability will suffer somewhat. + */ + +#define CONST_BITS 13 + +#define FIX_0_211164243 (1730) /* FIX(0.211164243) */ +#define FIX_0_509795579 (4176) /* FIX(0.509795579) */ +#define FIX_0_601344887 (4926) /* FIX(0.601344887) */ +#define FIX_0_720959822 (5906) /* FIX(0.720959822) */ +#define FIX_0_765366865 (6270) /* FIX(0.765366865) */ +#define FIX_0_850430095 (6967) /* FIX(0.850430095) */ +#define FIX_0_899976223 (7373) /* FIX(0.899976223) */ +#define FIX_1_061594337 (8697) /* FIX(1.061594337) */ +#define FIX_1_272758580 (10426) /* FIX(1.272758580) */ +#define FIX_1_451774981 (11893) /* FIX(1.451774981) */ +#define FIX_1_847759065 (15137) /* FIX(1.847759065) */ +#define FIX_2_172734803 (17799) /* FIX(2.172734803) */ +#define FIX_2_562915447 (20995) /* FIX(2.562915447) */ +#define FIX_3_624509785 (29692) /* FIX(3.624509785) */ + +.balign 16 +Ljsimd_idct_4x4_neon_consts: + .short FIX_1_847759065 /* v0.h[0] */ + .short -FIX_0_765366865 /* v0.h[1] */ + .short -FIX_0_211164243 /* v0.h[2] */ + .short FIX_1_451774981 /* v0.h[3] */ + .short -FIX_2_172734803 /* d1[0] */ + .short FIX_1_061594337 /* d1[1] */ + .short -FIX_0_509795579 /* d1[2] */ + .short -FIX_0_601344887 /* d1[3] */ + .short FIX_0_899976223 /* v2.h[0] */ + .short FIX_2_562915447 /* v2.h[1] */ + .short 1 << (CONST_BITS+1) /* v2.h[2] */ + .short 0 /* v2.h[3] */ + +.macro idct_helper x4, x6, x8, x10, x12, x14, x16, shift, y26, y27, y28, y29 + smull v28.4s, \x4, v2.h[2] + smlal v28.4s, \x8, v0.h[0] + smlal v28.4s, \x14, v0.h[1] + + smull v26.4s, \x16, v1.h[2] + smlal v26.4s, \x12, v1.h[3] + smlal v26.4s, \x10, v2.h[0] + smlal v26.4s, \x6, v2.h[1] + + smull v30.4s, \x4, v2.h[2] + smlsl v30.4s, \x8, v0.h[0] + smlsl v30.4s, \x14, v0.h[1] + + smull v24.4s, \x16, v0.h[2] + smlal v24.4s, \x12, v0.h[3] + smlal v24.4s, \x10, v1.h[0] + smlal v24.4s, \x6, v1.h[1] + + add v20.4s, v28.4s, v26.4s + sub v28.4s, v28.4s, v26.4s + + .if \shift > 16 + srshr v20.4s, v20.4s, #\shift + srshr v28.4s, v28.4s, #\shift + xtn \y26, v20.4s + xtn \y29, v28.4s + .else + rshrn \y26, v20.4s, #\shift + rshrn \y29, v28.4s, #\shift + .endif + + add v20.4s, v30.4s, v24.4s + sub v30.4s, v30.4s, v24.4s + + .if \shift > 16 + srshr v20.4s, v20.4s, #\shift + srshr v30.4s, v30.4s, #\shift + xtn \y27, v20.4s + xtn \y28, v30.4s + .else + rshrn \y27, v20.4s, #\shift + rshrn \y28, v30.4s, #\shift + .endif +.endm + +asm_function jsimd_idct_4x4_neon + + DCT_TABLE .req x0 + COEF_BLOCK .req x1 + OUTPUT_BUF .req x2 + OUTPUT_COL .req x3 + TMP1 .req x0 + TMP2 .req x1 + TMP3 .req x2 + TMP4 .req x15 + + /* OUTPUT_COL is a JDIMENSION (unsigned int) argument, so the ABI doesn't + guarantee that the upper (unused) 32 bits of x3 are valid. This + instruction ensures that those bits are set to zero. */ + uxtw x3, w3 + + /* Save all used NEON registers */ + sub sp, sp, 64 + mov x9, sp + /* Load constants (v3.4h is just used for padding) */ + adr TMP4, Ljsimd_idct_4x4_neon_consts + st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x9], 32 + st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x9], 32 + ld1 {v0.4h, v1.4h, v2.4h, v3.4h}, [TMP4] + + /* Load all COEF_BLOCK into NEON registers with the following allocation: + * 0 1 2 3 | 4 5 6 7 + * ---------+-------- + * 0 | v4.4h | v5.4h + * 1 | v6.4h | v7.4h + * 2 | v8.4h | v9.4h + * 3 | v10.4h | v11.4h + * 4 | - | - + * 5 | v12.4h | v13.4h + * 6 | v14.4h | v15.4h + * 7 | v16.4h | v17.4h + */ + ld1 {v4.4h, v5.4h, v6.4h, v7.4h}, [COEF_BLOCK], 32 + ld1 {v8.4h, v9.4h, v10.4h, v11.4h}, [COEF_BLOCK], 32 + add COEF_BLOCK, COEF_BLOCK, #16 + ld1 {v12.4h, v13.4h, v14.4h, v15.4h}, [COEF_BLOCK], 32 + ld1 {v16.4h, v17.4h}, [COEF_BLOCK], 16 + /* dequantize */ + ld1 {v18.4h, v19.4h, v20.4h, v21.4h}, [DCT_TABLE], 32 + mul v4.4h, v4.4h, v18.4h + mul v5.4h, v5.4h, v19.4h + ins v4.d[1], v5.d[0] /* 128 bit q4 */ + ld1 {v22.4h, v23.4h, v24.4h, v25.4h}, [DCT_TABLE], 32 + mul v6.4h, v6.4h, v20.4h + mul v7.4h, v7.4h, v21.4h + ins v6.d[1], v7.d[0] /* 128 bit q6 */ + mul v8.4h, v8.4h, v22.4h + mul v9.4h, v9.4h, v23.4h + ins v8.d[1], v9.d[0] /* 128 bit q8 */ + add DCT_TABLE, DCT_TABLE, #16 + ld1 {v26.4h, v27.4h, v28.4h, v29.4h}, [DCT_TABLE], 32 + mul v10.4h, v10.4h, v24.4h + mul v11.4h, v11.4h, v25.4h + ins v10.d[1], v11.d[0] /* 128 bit q10 */ + mul v12.4h, v12.4h, v26.4h + mul v13.4h, v13.4h, v27.4h + ins v12.d[1], v13.d[0] /* 128 bit q12 */ + ld1 {v30.4h, v31.4h}, [DCT_TABLE], 16 + mul v14.4h, v14.4h, v28.4h + mul v15.4h, v15.4h, v29.4h + ins v14.d[1], v15.d[0] /* 128 bit q14 */ + mul v16.4h, v16.4h, v30.4h + mul v17.4h, v17.4h, v31.4h + ins v16.d[1], v17.d[0] /* 128 bit q16 */ + + /* Pass 1 */ + idct_helper v4.4h, v6.4h, v8.4h, v10.4h, v12.4h, v14.4h, v16.4h, 12, \ + v4.4h, v6.4h, v8.4h, v10.4h + transpose_4x4 v4, v6, v8, v10, v3 + ins v10.d[1], v11.d[0] + idct_helper v5.4h, v7.4h, v9.4h, v11.4h, v13.4h, v15.4h, v17.4h, 12, \ + v5.4h, v7.4h, v9.4h, v11.4h + transpose_4x4 v5, v7, v9, v11, v3 + ins v10.d[1], v11.d[0] + + /* Pass 2 */ + idct_helper v4.4h, v6.4h, v8.4h, v10.4h, v7.4h, v9.4h, v11.4h, 19, \ + v26.4h, v27.4h, v28.4h, v29.4h + transpose_4x4 v26, v27, v28, v29, v3 + + /* Range limit */ + movi v30.8h, #0x80 + ins v26.d[1], v27.d[0] + ins v28.d[1], v29.d[0] + add v26.8h, v26.8h, v30.8h + add v28.8h, v28.8h, v30.8h + sqxtun v26.8b, v26.8h + sqxtun v27.8b, v28.8h + + /* Store results to the output buffer */ + ldp TMP1, TMP2, [OUTPUT_BUF], 16 + ldp TMP3, TMP4, [OUTPUT_BUF] + add TMP1, TMP1, OUTPUT_COL + add TMP2, TMP2, OUTPUT_COL + add TMP3, TMP3, OUTPUT_COL + add TMP4, TMP4, OUTPUT_COL + +#if defined(__ARMEL__) && !RESPECT_STRICT_ALIGNMENT + /* We can use much less instructions on little endian systems if the + * OS kernel is not configured to trap unaligned memory accesses + */ + st1 {v26.s}[0], [TMP1], 4 + st1 {v27.s}[0], [TMP3], 4 + st1 {v26.s}[1], [TMP2], 4 + st1 {v27.s}[1], [TMP4], 4 +#else + st1 {v26.b}[0], [TMP1], 1 + st1 {v27.b}[0], [TMP3], 1 + st1 {v26.b}[1], [TMP1], 1 + st1 {v27.b}[1], [TMP3], 1 + st1 {v26.b}[2], [TMP1], 1 + st1 {v27.b}[2], [TMP3], 1 + st1 {v26.b}[3], [TMP1], 1 + st1 {v27.b}[3], [TMP3], 1 + + st1 {v26.b}[4], [TMP2], 1 + st1 {v27.b}[4], [TMP4], 1 + st1 {v26.b}[5], [TMP2], 1 + st1 {v27.b}[5], [TMP4], 1 + st1 {v26.b}[6], [TMP2], 1 + st1 {v27.b}[6], [TMP4], 1 + st1 {v26.b}[7], [TMP2], 1 + st1 {v27.b}[7], [TMP4], 1 +#endif + + /* vpop {v8.4h - v15.4h} ;not available */ + ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32 + ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32 + blr x30 + + .unreq DCT_TABLE + .unreq COEF_BLOCK + .unreq OUTPUT_BUF + .unreq OUTPUT_COL + .unreq TMP1 + .unreq TMP2 + .unreq TMP3 + .unreq TMP4 + +.purgem idct_helper + + +/*****************************************************************************/ + +/* + * jsimd_idct_2x2_neon + * + * This function contains inverse-DCT code for getting reduced-size + * 2x2 pixels output from an 8x8 DCT block. It uses the same calculations + * and produces exactly the same output as IJG's original 'jpeg_idct_2x2' + * function from jpeg-6b (jidctred.c). + * + * NOTE: jpeg-8 has an improved implementation of 2x2 inverse-DCT, which + * requires much less arithmetic operations and hence should be faster. + * The primary purpose of this particular NEON optimized function is + * bit exact compatibility with jpeg-6b. + */ + +.balign 8 +Ljsimd_idct_2x2_neon_consts: + .short -FIX_0_720959822 /* v14[0] */ + .short FIX_0_850430095 /* v14[1] */ + .short -FIX_1_272758580 /* v14[2] */ + .short FIX_3_624509785 /* v14[3] */ + +.macro idct_helper x4, x6, x10, x12, x16, shift, y26, y27 + sshll v15.4s, \x4, #15 + smull v26.4s, \x6, v14.h[3] + smlal v26.4s, \x10, v14.h[2] + smlal v26.4s, \x12, v14.h[1] + smlal v26.4s, \x16, v14.h[0] + + add v20.4s, v15.4s, v26.4s + sub v15.4s, v15.4s, v26.4s + + .if \shift > 16 + srshr v20.4s, v20.4s, #\shift + srshr v15.4s, v15.4s, #\shift + xtn \y26, v20.4s + xtn \y27, v15.4s + .else + rshrn \y26, v20.4s, #\shift + rshrn \y27, v15.4s, #\shift + .endif +.endm + +asm_function jsimd_idct_2x2_neon + + DCT_TABLE .req x0 + COEF_BLOCK .req x1 + OUTPUT_BUF .req x2 + OUTPUT_COL .req x3 + TMP1 .req x0 + TMP2 .req x15 + + /* OUTPUT_COL is a JDIMENSION (unsigned int) argument, so the ABI doesn't + guarantee that the upper (unused) 32 bits of x3 are valid. This + instruction ensures that those bits are set to zero. */ + uxtw x3, w3 + + /* vpush {v8.4h - v15.4h} ; not available */ + sub sp, sp, 64 + mov x9, sp + + /* Load constants */ + adr TMP2, Ljsimd_idct_2x2_neon_consts + st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x9], 32 + st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x9], 32 + ld1 {v14.4h}, [TMP2] + + /* Load all COEF_BLOCK into NEON registers with the following allocation: + * 0 1 2 3 | 4 5 6 7 + * ---------+-------- + * 0 | v4.4h | v5.4h + * 1 | v6.4h | v7.4h + * 2 | - | - + * 3 | v10.4h | v11.4h + * 4 | - | - + * 5 | v12.4h | v13.4h + * 6 | - | - + * 7 | v16.4h | v17.4h + */ + ld1 {v4.4h, v5.4h, v6.4h, v7.4h}, [COEF_BLOCK], 32 + add COEF_BLOCK, COEF_BLOCK, #16 + ld1 {v10.4h, v11.4h}, [COEF_BLOCK], 16 + add COEF_BLOCK, COEF_BLOCK, #16 + ld1 {v12.4h, v13.4h}, [COEF_BLOCK], 16 + add COEF_BLOCK, COEF_BLOCK, #16 + ld1 {v16.4h, v17.4h}, [COEF_BLOCK], 16 + /* Dequantize */ + ld1 {v18.4h, v19.4h, v20.4h, v21.4h}, [DCT_TABLE], 32 + mul v4.4h, v4.4h, v18.4h + mul v5.4h, v5.4h, v19.4h + ins v4.d[1], v5.d[0] + mul v6.4h, v6.4h, v20.4h + mul v7.4h, v7.4h, v21.4h + ins v6.d[1], v7.d[0] + add DCT_TABLE, DCT_TABLE, #16 + ld1 {v24.4h, v25.4h}, [DCT_TABLE], 16 + mul v10.4h, v10.4h, v24.4h + mul v11.4h, v11.4h, v25.4h + ins v10.d[1], v11.d[0] + add DCT_TABLE, DCT_TABLE, #16 + ld1 {v26.4h, v27.4h}, [DCT_TABLE], 16 + mul v12.4h, v12.4h, v26.4h + mul v13.4h, v13.4h, v27.4h + ins v12.d[1], v13.d[0] + add DCT_TABLE, DCT_TABLE, #16 + ld1 {v30.4h, v31.4h}, [DCT_TABLE], 16 + mul v16.4h, v16.4h, v30.4h + mul v17.4h, v17.4h, v31.4h + ins v16.d[1], v17.d[0] + + /* Pass 1 */ +#if 0 + idct_helper v4.4h, v6.4h, v10.4h, v12.4h, v16.4h, 13, v4.4h, v6.4h + transpose_4x4 v4.4h, v6.4h, v8.4h, v10.4h + idct_helper v5.4h, v7.4h, v11.4h, v13.4h, v17.4h, 13, v5.4h, v7.4h + transpose_4x4 v5.4h, v7.4h, v9.4h, v11.4h +#else + smull v26.4s, v6.4h, v14.h[3] + smlal v26.4s, v10.4h, v14.h[2] + smlal v26.4s, v12.4h, v14.h[1] + smlal v26.4s, v16.4h, v14.h[0] + smull v24.4s, v7.4h, v14.h[3] + smlal v24.4s, v11.4h, v14.h[2] + smlal v24.4s, v13.4h, v14.h[1] + smlal v24.4s, v17.4h, v14.h[0] + sshll v15.4s, v4.4h, #15 + sshll v30.4s, v5.4h, #15 + add v20.4s, v15.4s, v26.4s + sub v15.4s, v15.4s, v26.4s + rshrn v4.4h, v20.4s, #13 + rshrn v6.4h, v15.4s, #13 + add v20.4s, v30.4s, v24.4s + sub v15.4s, v30.4s, v24.4s + rshrn v5.4h, v20.4s, #13 + rshrn v7.4h, v15.4s, #13 + ins v4.d[1], v5.d[0] + ins v6.d[1], v7.d[0] + transpose v4, v6, v3, .16b, .8h + transpose v6, v10, v3, .16b, .4s + ins v11.d[0], v10.d[1] + ins v7.d[0], v6.d[1] +#endif + + /* Pass 2 */ + idct_helper v4.4h, v6.4h, v10.4h, v7.4h, v11.4h, 20, v26.4h, v27.4h + + /* Range limit */ + movi v30.8h, #0x80 + ins v26.d[1], v27.d[0] + add v26.8h, v26.8h, v30.8h + sqxtun v30.8b, v26.8h + ins v26.d[0], v30.d[0] + sqxtun v27.8b, v26.8h + + /* Store results to the output buffer */ + ldp TMP1, TMP2, [OUTPUT_BUF] + add TMP1, TMP1, OUTPUT_COL + add TMP2, TMP2, OUTPUT_COL + + st1 {v26.b}[0], [TMP1], 1 + st1 {v27.b}[4], [TMP1], 1 + st1 {v26.b}[1], [TMP2], 1 + st1 {v27.b}[5], [TMP2], 1 + + ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32 + ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32 + blr x30 + + .unreq DCT_TABLE + .unreq COEF_BLOCK + .unreq OUTPUT_BUF + .unreq OUTPUT_COL + .unreq TMP1 + .unreq TMP2 + +.purgem idct_helper + + +/*****************************************************************************/ + +/* + * jsimd_ycc_extrgb_convert_neon + * jsimd_ycc_extbgr_convert_neon + * jsimd_ycc_extrgbx_convert_neon + * jsimd_ycc_extbgrx_convert_neon + * jsimd_ycc_extxbgr_convert_neon + * jsimd_ycc_extxrgb_convert_neon + * + * Colorspace conversion YCbCr -> RGB + */ + +.macro do_load size + .if \size == 8 + ld1 {v4.8b}, [U], 8 + ld1 {v5.8b}, [V], 8 + ld1 {v0.8b}, [Y], 8 + prfm pldl1keep, [U, #64] + prfm pldl1keep, [V, #64] + prfm pldl1keep, [Y, #64] + .elseif \size == 4 + ld1 {v4.b}[0], [U], 1 + ld1 {v4.b}[1], [U], 1 + ld1 {v4.b}[2], [U], 1 + ld1 {v4.b}[3], [U], 1 + ld1 {v5.b}[0], [V], 1 + ld1 {v5.b}[1], [V], 1 + ld1 {v5.b}[2], [V], 1 + ld1 {v5.b}[3], [V], 1 + ld1 {v0.b}[0], [Y], 1 + ld1 {v0.b}[1], [Y], 1 + ld1 {v0.b}[2], [Y], 1 + ld1 {v0.b}[3], [Y], 1 + .elseif \size == 2 + ld1 {v4.b}[4], [U], 1 + ld1 {v4.b}[5], [U], 1 + ld1 {v5.b}[4], [V], 1 + ld1 {v5.b}[5], [V], 1 + ld1 {v0.b}[4], [Y], 1 + ld1 {v0.b}[5], [Y], 1 + .elseif \size == 1 + ld1 {v4.b}[6], [U], 1 + ld1 {v5.b}[6], [V], 1 + ld1 {v0.b}[6], [Y], 1 + .else + .error unsupported macroblock size + .endif +.endm + +.macro do_store bpp, size, fast_st3 + .if \bpp == 24 + .if \size == 8 + .if \fast_st3 == 1 + st3 {v10.8b, v11.8b, v12.8b}, [RGB], 24 + .else + st1 {v10.b}[0], [RGB], #1 + st1 {v11.b}[0], [RGB], #1 + st1 {v12.b}[0], [RGB], #1 + + st1 {v10.b}[1], [RGB], #1 + st1 {v11.b}[1], [RGB], #1 + st1 {v12.b}[1], [RGB], #1 + + st1 {v10.b}[2], [RGB], #1 + st1 {v11.b}[2], [RGB], #1 + st1 {v12.b}[2], [RGB], #1 + + st1 {v10.b}[3], [RGB], #1 + st1 {v11.b}[3], [RGB], #1 + st1 {v12.b}[3], [RGB], #1 + + st1 {v10.b}[4], [RGB], #1 + st1 {v11.b}[4], [RGB], #1 + st1 {v12.b}[4], [RGB], #1 + + st1 {v10.b}[5], [RGB], #1 + st1 {v11.b}[5], [RGB], #1 + st1 {v12.b}[5], [RGB], #1 + + st1 {v10.b}[6], [RGB], #1 + st1 {v11.b}[6], [RGB], #1 + st1 {v12.b}[6], [RGB], #1 + + st1 {v10.b}[7], [RGB], #1 + st1 {v11.b}[7], [RGB], #1 + st1 {v12.b}[7], [RGB], #1 + .endif + .elseif \size == 4 + st3 {v10.b, v11.b, v12.b}[0], [RGB], 3 + st3 {v10.b, v11.b, v12.b}[1], [RGB], 3 + st3 {v10.b, v11.b, v12.b}[2], [RGB], 3 + st3 {v10.b, v11.b, v12.b}[3], [RGB], 3 + .elseif \size == 2 + st3 {v10.b, v11.b, v12.b}[4], [RGB], 3 + st3 {v10.b, v11.b, v12.b}[5], [RGB], 3 + .elseif \size == 1 + st3 {v10.b, v11.b, v12.b}[6], [RGB], 3 + .else + .error unsupported macroblock size + .endif + .elseif \bpp == 32 + .if \size == 8 + st4 {v10.8b, v11.8b, v12.8b, v13.8b}, [RGB], 32 + .elseif \size == 4 + st4 {v10.b, v11.b, v12.b, v13.b}[0], [RGB], 4 + st4 {v10.b, v11.b, v12.b, v13.b}[1], [RGB], 4 + st4 {v10.b, v11.b, v12.b, v13.b}[2], [RGB], 4 + st4 {v10.b, v11.b, v12.b, v13.b}[3], [RGB], 4 + .elseif \size == 2 + st4 {v10.b, v11.b, v12.b, v13.b}[4], [RGB], 4 + st4 {v10.b, v11.b, v12.b, v13.b}[5], [RGB], 4 + .elseif \size == 1 + st4 {v10.b, v11.b, v12.b, v13.b}[6], [RGB], 4 + .else + .error unsupported macroblock size + .endif + .elseif \bpp==16 + .if \size == 8 + st1 {v25.8h}, [RGB], 16 + .elseif \size == 4 + st1 {v25.4h}, [RGB], 8 + .elseif \size == 2 + st1 {v25.h}[4], [RGB], 2 + st1 {v25.h}[5], [RGB], 2 + .elseif \size == 1 + st1 {v25.h}[6], [RGB], 2 + .else + .error unsupported macroblock size + .endif + .else + .error unsupported bpp + .endif +.endm + +.macro generate_jsimd_ycc_rgb_convert_neon colorid, bpp, r_offs, rsize, \ + g_offs, gsize, b_offs, bsize, \ + defsize, fast_st3 + +/* + * 2-stage pipelined YCbCr->RGB conversion + */ + +.macro do_yuv_to_rgb_stage1 + uaddw v6.8h, v2.8h, v4.8b /* q3 = u - 128 */ + uaddw v8.8h, v2.8h, v5.8b /* q2 = v - 128 */ + smull v20.4s, v6.4h, v1.h[1] /* multiply by -11277 */ + smlal v20.4s, v8.4h, v1.h[2] /* multiply by -23401 */ + smull2 v22.4s, v6.8h, v1.h[1] /* multiply by -11277 */ + smlal2 v22.4s, v8.8h, v1.h[2] /* multiply by -23401 */ + smull v24.4s, v8.4h, v1.h[0] /* multiply by 22971 */ + smull2 v26.4s, v8.8h, v1.h[0] /* multiply by 22971 */ + smull v28.4s, v6.4h, v1.h[3] /* multiply by 29033 */ + smull2 v30.4s, v6.8h, v1.h[3] /* multiply by 29033 */ +.endm + +.macro do_yuv_to_rgb_stage2 + rshrn v20.4h, v20.4s, #15 + rshrn2 v20.8h, v22.4s, #15 + rshrn v24.4h, v24.4s, #14 + rshrn2 v24.8h, v26.4s, #14 + rshrn v28.4h, v28.4s, #14 + rshrn2 v28.8h, v30.4s, #14 + uaddw v20.8h, v20.8h, v0.8b + uaddw v24.8h, v24.8h, v0.8b + uaddw v28.8h, v28.8h, v0.8b + .if \bpp != 16 + sqxtun v1\g_offs\defsize, v20.8h + sqxtun v1\r_offs\defsize, v24.8h + sqxtun v1\b_offs\defsize, v28.8h + .else + sqshlu v21.8h, v20.8h, #8 + sqshlu v25.8h, v24.8h, #8 + sqshlu v29.8h, v28.8h, #8 + sri v25.8h, v21.8h, #5 + sri v25.8h, v29.8h, #11 + .endif +.endm + +.macro do_yuv_to_rgb_stage2_store_load_stage1 fast_st3 + rshrn v20.4h, v20.4s, #15 + rshrn v24.4h, v24.4s, #14 + rshrn v28.4h, v28.4s, #14 + ld1 {v4.8b}, [U], 8 + rshrn2 v20.8h, v22.4s, #15 + rshrn2 v24.8h, v26.4s, #14 + rshrn2 v28.8h, v30.4s, #14 + ld1 {v5.8b}, [V], 8 + uaddw v20.8h, v20.8h, v0.8b + uaddw v24.8h, v24.8h, v0.8b + uaddw v28.8h, v28.8h, v0.8b + .if \bpp != 16 /**************** rgb24/rgb32 ******************************/ + sqxtun v1\g_offs\defsize, v20.8h + ld1 {v0.8b}, [Y], 8 + sqxtun v1\r_offs\defsize, v24.8h + prfm pldl1keep, [U, #64] + prfm pldl1keep, [V, #64] + prfm pldl1keep, [Y, #64] + sqxtun v1\b_offs\defsize, v28.8h + uaddw v6.8h, v2.8h, v4.8b /* v6.16b = u - 128 */ + uaddw v8.8h, v2.8h, v5.8b /* q2 = v - 128 */ + smull v20.4s, v6.4h, v1.h[1] /* multiply by -11277 */ + smlal v20.4s, v8.4h, v1.h[2] /* multiply by -23401 */ + smull2 v22.4s, v6.8h, v1.h[1] /* multiply by -11277 */ + smlal2 v22.4s, v8.8h, v1.h[2] /* multiply by -23401 */ + smull v24.4s, v8.4h, v1.h[0] /* multiply by 22971 */ + smull2 v26.4s, v8.8h, v1.h[0] /* multiply by 22971 */ + .else /**************************** rgb565 ********************************/ + sqshlu v21.8h, v20.8h, #8 + sqshlu v25.8h, v24.8h, #8 + sqshlu v29.8h, v28.8h, #8 + uaddw v6.8h, v2.8h, v4.8b /* v6.16b = u - 128 */ + uaddw v8.8h, v2.8h, v5.8b /* q2 = v - 128 */ + ld1 {v0.8b}, [Y], 8 + smull v20.4s, v6.4h, v1.h[1] /* multiply by -11277 */ + smlal v20.4s, v8.4h, v1.h[2] /* multiply by -23401 */ + smull2 v22.4s, v6.8h, v1.h[1] /* multiply by -11277 */ + smlal2 v22.4s, v8.8h, v1.h[2] /* multiply by -23401 */ + sri v25.8h, v21.8h, #5 + smull v24.4s, v8.4h, v1.h[0] /* multiply by 22971 */ + smull2 v26.4s, v8.8h, v1.h[0] /* multiply by 22971 */ + prfm pldl1keep, [U, #64] + prfm pldl1keep, [V, #64] + prfm pldl1keep, [Y, #64] + sri v25.8h, v29.8h, #11 + .endif + do_store \bpp, 8, \fast_st3 + smull v28.4s, v6.4h, v1.h[3] /* multiply by 29033 */ + smull2 v30.4s, v6.8h, v1.h[3] /* multiply by 29033 */ +.endm + +.macro do_yuv_to_rgb + do_yuv_to_rgb_stage1 + do_yuv_to_rgb_stage2 +.endm + +/* Apple gas crashes on adrl, work around that by using adr. + * But this requires a copy of these constants for each function. + */ + +.balign 16 +.if \fast_st3 == 1 +Ljsimd_ycc_\colorid\()_neon_consts: +.else +Ljsimd_ycc_\colorid\()_neon_slowst3_consts: +.endif + .short 0, 0, 0, 0 + .short 22971, -11277, -23401, 29033 + .short -128, -128, -128, -128 + .short -128, -128, -128, -128 + +.if \fast_st3 == 1 +asm_function jsimd_ycc_\colorid\()_convert_neon +.else +asm_function jsimd_ycc_\colorid\()_convert_neon_slowst3 +.endif + OUTPUT_WIDTH .req w0 + INPUT_BUF .req x1 + INPUT_ROW .req w2 + OUTPUT_BUF .req x3 + NUM_ROWS .req w4 + + INPUT_BUF0 .req x5 + INPUT_BUF1 .req x6 + INPUT_BUF2 .req x1 + + RGB .req x7 + Y .req x9 + U .req x10 + V .req x11 + N .req w15 + + sub sp, sp, 64 + mov x9, sp + + /* Load constants to d1, d2, d3 (v0.4h is just used for padding) */ + .if \fast_st3 == 1 + adr x15, Ljsimd_ycc_\colorid\()_neon_consts + .else + adr x15, Ljsimd_ycc_\colorid\()_neon_slowst3_consts + .endif + + /* Save NEON registers */ + st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x9], 32 + st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x9], 32 + ld1 {v0.4h, v1.4h}, [x15], 16 + ld1 {v2.8h}, [x15] + + ldr INPUT_BUF0, [INPUT_BUF] + ldr INPUT_BUF1, [INPUT_BUF, #8] + ldr INPUT_BUF2, [INPUT_BUF, #16] + .unreq INPUT_BUF + + /* Initially set v10, v11.4h, v12.8b, d13 to 0xFF */ + movi v10.16b, #255 + movi v13.16b, #255 + + /* Outer loop over scanlines */ + cmp NUM_ROWS, #1 + b.lt 9f +0: + ldr Y, [INPUT_BUF0, INPUT_ROW, uxtw #3] + ldr U, [INPUT_BUF1, INPUT_ROW, uxtw #3] + mov N, OUTPUT_WIDTH + ldr V, [INPUT_BUF2, INPUT_ROW, uxtw #3] + add INPUT_ROW, INPUT_ROW, #1 + ldr RGB, [OUTPUT_BUF], #8 + + /* Inner loop over pixels */ + subs N, N, #8 + b.lt 3f + do_load 8 + do_yuv_to_rgb_stage1 + subs N, N, #8 + b.lt 2f +1: + do_yuv_to_rgb_stage2_store_load_stage1 \fast_st3 + subs N, N, #8 + b.ge 1b +2: + do_yuv_to_rgb_stage2 + do_store \bpp, 8, \fast_st3 + tst N, #7 + b.eq 8f +3: + tst N, #4 + b.eq 3f + do_load 4 +3: + tst N, #2 + b.eq 4f + do_load 2 +4: + tst N, #1 + b.eq 5f + do_load 1 +5: + do_yuv_to_rgb + tst N, #4 + b.eq 6f + do_store \bpp, 4, \fast_st3 +6: + tst N, #2 + b.eq 7f + do_store \bpp, 2, \fast_st3 +7: + tst N, #1 + b.eq 8f + do_store \bpp, 1, \fast_st3 +8: + subs NUM_ROWS, NUM_ROWS, #1 + b.gt 0b +9: + /* Restore all registers and return */ + ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32 + ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32 + br x30 + .unreq OUTPUT_WIDTH + .unreq INPUT_ROW + .unreq OUTPUT_BUF + .unreq NUM_ROWS + .unreq INPUT_BUF0 + .unreq INPUT_BUF1 + .unreq INPUT_BUF2 + .unreq RGB + .unreq Y + .unreq U + .unreq V + .unreq N + +.purgem do_yuv_to_rgb +.purgem do_yuv_to_rgb_stage1 +.purgem do_yuv_to_rgb_stage2 +.purgem do_yuv_to_rgb_stage2_store_load_stage1 + +.endm + +/*--------------------------------- id ----- bpp R rsize G gsize B bsize defsize fast_st3*/ +generate_jsimd_ycc_rgb_convert_neon extrgb, 24, 0, .4h, 1, .4h, 2, .4h, .8b, 1 +generate_jsimd_ycc_rgb_convert_neon extbgr, 24, 2, .4h, 1, .4h, 0, .4h, .8b, 1 +generate_jsimd_ycc_rgb_convert_neon extrgbx, 32, 0, .4h, 1, .4h, 2, .4h, .8b, 1 +generate_jsimd_ycc_rgb_convert_neon extbgrx, 32, 2, .4h, 1, .4h, 0, .4h, .8b, 1 +generate_jsimd_ycc_rgb_convert_neon extxbgr, 32, 3, .4h, 2, .4h, 1, .4h, .8b, 1 +generate_jsimd_ycc_rgb_convert_neon extxrgb, 32, 1, .4h, 2, .4h, 3, .4h, .8b, 1 +generate_jsimd_ycc_rgb_convert_neon rgb565, 16, 0, .4h, 0, .4h, 0, .4h, .8b, 1 + +generate_jsimd_ycc_rgb_convert_neon extrgb, 24, 0, .4h, 1, .4h, 2, .4h, .8b, 0 +generate_jsimd_ycc_rgb_convert_neon extbgr, 24, 2, .4h, 1, .4h, 0, .4h, .8b, 0 + +.purgem do_load +.purgem do_store + + +/*****************************************************************************/ + +/* + * jsimd_extrgb_ycc_convert_neon + * jsimd_extbgr_ycc_convert_neon + * jsimd_extrgbx_ycc_convert_neon + * jsimd_extbgrx_ycc_convert_neon + * jsimd_extxbgr_ycc_convert_neon + * jsimd_extxrgb_ycc_convert_neon + * + * Colorspace conversion RGB -> YCbCr + */ + +.macro do_store size + .if \size == 8 + st1 {v20.8b}, [Y], #8 + st1 {v21.8b}, [U], #8 + st1 {v22.8b}, [V], #8 + .elseif \size == 4 + st1 {v20.b}[0], [Y], #1 + st1 {v20.b}[1], [Y], #1 + st1 {v20.b}[2], [Y], #1 + st1 {v20.b}[3], [Y], #1 + st1 {v21.b}[0], [U], #1 + st1 {v21.b}[1], [U], #1 + st1 {v21.b}[2], [U], #1 + st1 {v21.b}[3], [U], #1 + st1 {v22.b}[0], [V], #1 + st1 {v22.b}[1], [V], #1 + st1 {v22.b}[2], [V], #1 + st1 {v22.b}[3], [V], #1 + .elseif \size == 2 + st1 {v20.b}[4], [Y], #1 + st1 {v20.b}[5], [Y], #1 + st1 {v21.b}[4], [U], #1 + st1 {v21.b}[5], [U], #1 + st1 {v22.b}[4], [V], #1 + st1 {v22.b}[5], [V], #1 + .elseif \size == 1 + st1 {v20.b}[6], [Y], #1 + st1 {v21.b}[6], [U], #1 + st1 {v22.b}[6], [V], #1 + .else + .error unsupported macroblock size + .endif +.endm + +.macro do_load bpp, size, fast_ld3 + .if \bpp == 24 + .if \size == 8 + .if \fast_ld3 == 1 + ld3 {v10.8b, v11.8b, v12.8b}, [RGB], #24 + .else + ld1 {v10.b}[0], [RGB], #1 + ld1 {v11.b}[0], [RGB], #1 + ld1 {v12.b}[0], [RGB], #1 + + ld1 {v10.b}[1], [RGB], #1 + ld1 {v11.b}[1], [RGB], #1 + ld1 {v12.b}[1], [RGB], #1 + + ld1 {v10.b}[2], [RGB], #1 + ld1 {v11.b}[2], [RGB], #1 + ld1 {v12.b}[2], [RGB], #1 + + ld1 {v10.b}[3], [RGB], #1 + ld1 {v11.b}[3], [RGB], #1 + ld1 {v12.b}[3], [RGB], #1 + + ld1 {v10.b}[4], [RGB], #1 + ld1 {v11.b}[4], [RGB], #1 + ld1 {v12.b}[4], [RGB], #1 + + ld1 {v10.b}[5], [RGB], #1 + ld1 {v11.b}[5], [RGB], #1 + ld1 {v12.b}[5], [RGB], #1 + + ld1 {v10.b}[6], [RGB], #1 + ld1 {v11.b}[6], [RGB], #1 + ld1 {v12.b}[6], [RGB], #1 + + ld1 {v10.b}[7], [RGB], #1 + ld1 {v11.b}[7], [RGB], #1 + ld1 {v12.b}[7], [RGB], #1 + .endif + prfm pldl1keep, [RGB, #128] + .elseif \size == 4 + ld3 {v10.b, v11.b, v12.b}[0], [RGB], #3 + ld3 {v10.b, v11.b, v12.b}[1], [RGB], #3 + ld3 {v10.b, v11.b, v12.b}[2], [RGB], #3 + ld3 {v10.b, v11.b, v12.b}[3], [RGB], #3 + .elseif \size == 2 + ld3 {v10.b, v11.b, v12.b}[4], [RGB], #3 + ld3 {v10.b, v11.b, v12.b}[5], [RGB], #3 + .elseif \size == 1 + ld3 {v10.b, v11.b, v12.b}[6], [RGB], #3 + .else + .error unsupported macroblock size + .endif + .elseif \bpp == 32 + .if \size == 8 + ld4 {v10.8b, v11.8b, v12.8b, v13.8b}, [RGB], #32 + prfm pldl1keep, [RGB, #128] + .elseif \size == 4 + ld4 {v10.b, v11.b, v12.b, v13.b}[0], [RGB], #4 + ld4 {v10.b, v11.b, v12.b, v13.b}[1], [RGB], #4 + ld4 {v10.b, v11.b, v12.b, v13.b}[2], [RGB], #4 + ld4 {v10.b, v11.b, v12.b, v13.b}[3], [RGB], #4 + .elseif \size == 2 + ld4 {v10.b, v11.b, v12.b, v13.b}[4], [RGB], #4 + ld4 {v10.b, v11.b, v12.b, v13.b}[5], [RGB], #4 + .elseif \size == 1 + ld4 {v10.b, v11.b, v12.b, v13.b}[6], [RGB], #4 + .else + .error unsupported macroblock size + .endif + .else + .error unsupported bpp + .endif +.endm + +.macro generate_jsimd_rgb_ycc_convert_neon colorid, bpp, r_offs, g_offs, \ + b_offs, fast_ld3 + +/* + * 2-stage pipelined RGB->YCbCr conversion + */ + +.macro do_rgb_to_yuv_stage1 + ushll v4.8h, v1\r_offs\().8b, #0 /* r = v4 */ + ushll v6.8h, v1\g_offs\().8b, #0 /* g = v6 */ + ushll v8.8h, v1\b_offs\().8b, #0 /* b = v8 */ + rev64 v18.4s, v1.4s + rev64 v26.4s, v1.4s + rev64 v28.4s, v1.4s + rev64 v30.4s, v1.4s + umull v14.4s, v4.4h, v0.h[0] + umull2 v16.4s, v4.8h, v0.h[0] + umlsl v18.4s, v4.4h, v0.h[3] + umlsl2 v26.4s, v4.8h, v0.h[3] + umlal v28.4s, v4.4h, v0.h[5] + umlal2 v30.4s, v4.8h, v0.h[5] + umlal v14.4s, v6.4h, v0.h[1] + umlal2 v16.4s, v6.8h, v0.h[1] + umlsl v18.4s, v6.4h, v0.h[4] + umlsl2 v26.4s, v6.8h, v0.h[4] + umlsl v28.4s, v6.4h, v0.h[6] + umlsl2 v30.4s, v6.8h, v0.h[6] + umlal v14.4s, v8.4h, v0.h[2] + umlal2 v16.4s, v8.8h, v0.h[2] + umlal v18.4s, v8.4h, v0.h[5] + umlal2 v26.4s, v8.8h, v0.h[5] + umlsl v28.4s, v8.4h, v0.h[7] + umlsl2 v30.4s, v8.8h, v0.h[7] +.endm + +.macro do_rgb_to_yuv_stage2 + rshrn v20.4h, v14.4s, #16 + shrn v22.4h, v18.4s, #16 + shrn v24.4h, v28.4s, #16 + rshrn2 v20.8h, v16.4s, #16 + shrn2 v22.8h, v26.4s, #16 + shrn2 v24.8h, v30.4s, #16 + xtn v20.8b, v20.8h /* v20 = y */ + xtn v21.8b, v22.8h /* v21 = u */ + xtn v22.8b, v24.8h /* v22 = v */ +.endm + +.macro do_rgb_to_yuv + do_rgb_to_yuv_stage1 + do_rgb_to_yuv_stage2 +.endm + +/* TODO: expand macros and interleave instructions if some in-order + * ARM64 processor actually can dual-issue LOAD/STORE with ALU */ +.macro do_rgb_to_yuv_stage2_store_load_stage1 fast_ld3 + do_rgb_to_yuv_stage2 + do_load \bpp, 8, \fast_ld3 + st1 {v20.8b}, [Y], #8 + st1 {v21.8b}, [U], #8 + st1 {v22.8b}, [V], #8 + do_rgb_to_yuv_stage1 +.endm + +.balign 16 +.if \fast_ld3 == 1 +Ljsimd_\colorid\()_ycc_neon_consts: +.else +Ljsimd_\colorid\()_ycc_neon_slowld3_consts: +.endif + .short 19595, 38470, 7471, 11059 + .short 21709, 32768, 27439, 5329 + .short 32767, 128, 32767, 128 + .short 32767, 128, 32767, 128 + +.if \fast_ld3 == 1 +asm_function jsimd_\colorid\()_ycc_convert_neon +.else +asm_function jsimd_\colorid\()_ycc_convert_neon_slowld3 +.endif + OUTPUT_WIDTH .req w0 + INPUT_BUF .req x1 + OUTPUT_BUF .req x2 + OUTPUT_ROW .req w3 + NUM_ROWS .req w4 + + OUTPUT_BUF0 .req x5 + OUTPUT_BUF1 .req x6 + OUTPUT_BUF2 .req x2 /* OUTPUT_BUF */ + + RGB .req x7 + Y .req x9 + U .req x10 + V .req x11 + N .req w12 + + /* Load constants to d0, d1, d2, d3 */ + .if \fast_ld3 == 1 + adr x13, Ljsimd_\colorid\()_ycc_neon_consts + .else + adr x13, Ljsimd_\colorid\()_ycc_neon_slowld3_consts + .endif + ld1 {v0.8h, v1.8h}, [x13] + + ldr OUTPUT_BUF0, [OUTPUT_BUF] + ldr OUTPUT_BUF1, [OUTPUT_BUF, #8] + ldr OUTPUT_BUF2, [OUTPUT_BUF, #16] + .unreq OUTPUT_BUF + + /* Save NEON registers */ + sub sp, sp, #64 + mov x9, sp + st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x9], 32 + st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x9], 32 + + /* Outer loop over scanlines */ + cmp NUM_ROWS, #1 + b.lt 9f +0: + ldr Y, [OUTPUT_BUF0, OUTPUT_ROW, uxtw #3] + ldr U, [OUTPUT_BUF1, OUTPUT_ROW, uxtw #3] + mov N, OUTPUT_WIDTH + ldr V, [OUTPUT_BUF2, OUTPUT_ROW, uxtw #3] + add OUTPUT_ROW, OUTPUT_ROW, #1 + ldr RGB, [INPUT_BUF], #8 + + /* Inner loop over pixels */ + subs N, N, #8 + b.lt 3f + do_load \bpp, 8, \fast_ld3 + do_rgb_to_yuv_stage1 + subs N, N, #8 + b.lt 2f +1: + do_rgb_to_yuv_stage2_store_load_stage1 \fast_ld3 + subs N, N, #8 + b.ge 1b +2: + do_rgb_to_yuv_stage2 + do_store 8 + tst N, #7 + b.eq 8f +3: + tbz N, #2, 3f + do_load \bpp, 4, \fast_ld3 +3: + tbz N, #1, 4f + do_load \bpp, 2, \fast_ld3 +4: + tbz N, #0, 5f + do_load \bpp, 1, \fast_ld3 +5: + do_rgb_to_yuv + tbz N, #2, 6f + do_store 4 +6: + tbz N, #1, 7f + do_store 2 +7: + tbz N, #0, 8f + do_store 1 +8: + subs NUM_ROWS, NUM_ROWS, #1 + b.gt 0b +9: + /* Restore all registers and return */ + ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32 + ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32 + br x30 + + .unreq OUTPUT_WIDTH + .unreq OUTPUT_ROW + .unreq INPUT_BUF + .unreq NUM_ROWS + .unreq OUTPUT_BUF0 + .unreq OUTPUT_BUF1 + .unreq OUTPUT_BUF2 + .unreq RGB + .unreq Y + .unreq U + .unreq V + .unreq N + +.purgem do_rgb_to_yuv +.purgem do_rgb_to_yuv_stage1 +.purgem do_rgb_to_yuv_stage2 +.purgem do_rgb_to_yuv_stage2_store_load_stage1 + +.endm + +/*--------------------------------- id ----- bpp R G B Fast LD3 */ +generate_jsimd_rgb_ycc_convert_neon extrgb, 24, 0, 1, 2, 1 +generate_jsimd_rgb_ycc_convert_neon extbgr, 24, 2, 1, 0, 1 +generate_jsimd_rgb_ycc_convert_neon extrgbx, 32, 0, 1, 2, 1 +generate_jsimd_rgb_ycc_convert_neon extbgrx, 32, 2, 1, 0, 1 +generate_jsimd_rgb_ycc_convert_neon extxbgr, 32, 3, 2, 1, 1 +generate_jsimd_rgb_ycc_convert_neon extxrgb, 32, 1, 2, 3, 1 + +generate_jsimd_rgb_ycc_convert_neon extrgb, 24, 0, 1, 2, 0 +generate_jsimd_rgb_ycc_convert_neon extbgr, 24, 2, 1, 0, 0 + +.purgem do_load +.purgem do_store + + +/*****************************************************************************/ + +/* + * Load data into workspace, applying unsigned->signed conversion + * + * TODO: can be combined with 'jsimd_fdct_ifast_neon' to get + * rid of VST1.16 instructions + */ + +asm_function jsimd_convsamp_neon + SAMPLE_DATA .req x0 + START_COL .req x1 + WORKSPACE .req x2 + TMP1 .req x9 + TMP2 .req x10 + TMP3 .req x11 + TMP4 .req x12 + TMP5 .req x13 + TMP6 .req x14 + TMP7 .req x15 + TMP8 .req x4 + TMPDUP .req w3 + + /* START_COL is a JDIMENSION (unsigned int) argument, so the ABI doesn't + guarantee that the upper (unused) 32 bits of x1 are valid. This + instruction ensures that those bits are set to zero. */ + uxtw x1, w1 + + mov TMPDUP, #128 + ldp TMP1, TMP2, [SAMPLE_DATA], 16 + ldp TMP3, TMP4, [SAMPLE_DATA], 16 + dup v0.8b, TMPDUP + add TMP1, TMP1, START_COL + add TMP2, TMP2, START_COL + ldp TMP5, TMP6, [SAMPLE_DATA], 16 + add TMP3, TMP3, START_COL + add TMP4, TMP4, START_COL + ldp TMP7, TMP8, [SAMPLE_DATA], 16 + add TMP5, TMP5, START_COL + add TMP6, TMP6, START_COL + ld1 {v16.8b}, [TMP1] + add TMP7, TMP7, START_COL + add TMP8, TMP8, START_COL + ld1 {v17.8b}, [TMP2] + usubl v16.8h, v16.8b, v0.8b + ld1 {v18.8b}, [TMP3] + usubl v17.8h, v17.8b, v0.8b + ld1 {v19.8b}, [TMP4] + usubl v18.8h, v18.8b, v0.8b + ld1 {v20.8b}, [TMP5] + usubl v19.8h, v19.8b, v0.8b + ld1 {v21.8b}, [TMP6] + st1 {v16.8h, v17.8h, v18.8h, v19.8h}, [WORKSPACE], 64 + usubl v20.8h, v20.8b, v0.8b + ld1 {v22.8b}, [TMP7] + usubl v21.8h, v21.8b, v0.8b + ld1 {v23.8b}, [TMP8] + usubl v22.8h, v22.8b, v0.8b + usubl v23.8h, v23.8b, v0.8b + st1 {v20.8h, v21.8h, v22.8h, v23.8h}, [WORKSPACE], 64 + + br x30 + + .unreq SAMPLE_DATA + .unreq START_COL + .unreq WORKSPACE + .unreq TMP1 + .unreq TMP2 + .unreq TMP3 + .unreq TMP4 + .unreq TMP5 + .unreq TMP6 + .unreq TMP7 + .unreq TMP8 + .unreq TMPDUP + +/*****************************************************************************/ + +/* + * jsimd_fdct_islow_neon + * + * This file contains a slow-but-accurate integer implementation of the + * forward DCT (Discrete Cosine Transform). The following code is based + * directly on the IJG''s original jfdctint.c; see the jfdctint.c for + * more details. + * + * TODO: can be combined with 'jsimd_convsamp_neon' to get + * rid of a bunch of VLD1.16 instructions + */ + +#define CONST_BITS 13 +#define PASS1_BITS 2 + +#define DESCALE_P1 (CONST_BITS-PASS1_BITS) +#define DESCALE_P2 (CONST_BITS+PASS1_BITS) + +#define F_0_298 2446 /* FIX(0.298631336) */ +#define F_0_390 3196 /* FIX(0.390180644) */ +#define F_0_541 4433 /* FIX(0.541196100) */ +#define F_0_765 6270 /* FIX(0.765366865) */ +#define F_0_899 7373 /* FIX(0.899976223) */ +#define F_1_175 9633 /* FIX(1.175875602) */ +#define F_1_501 12299 /* FIX(1.501321110) */ +#define F_1_847 15137 /* FIX(1.847759065) */ +#define F_1_961 16069 /* FIX(1.961570560) */ +#define F_2_053 16819 /* FIX(2.053119869) */ +#define F_2_562 20995 /* FIX(2.562915447) */ +#define F_3_072 25172 /* FIX(3.072711026) */ + +.balign 16 +Ljsimd_fdct_islow_neon_consts: + .short F_0_298 + .short -F_0_390 + .short F_0_541 + .short F_0_765 + .short - F_0_899 + .short F_1_175 + .short F_1_501 + .short - F_1_847 + .short - F_1_961 + .short F_2_053 + .short - F_2_562 + .short F_3_072 + .short 0 /* padding */ + .short 0 + .short 0 + .short 0 + +#undef F_0_298 +#undef F_0_390 +#undef F_0_541 +#undef F_0_765 +#undef F_0_899 +#undef F_1_175 +#undef F_1_501 +#undef F_1_847 +#undef F_1_961 +#undef F_2_053 +#undef F_2_562 +#undef F_3_072 +#define XFIX_P_0_298 v0.h[0] +#define XFIX_N_0_390 v0.h[1] +#define XFIX_P_0_541 v0.h[2] +#define XFIX_P_0_765 v0.h[3] +#define XFIX_N_0_899 v0.h[4] +#define XFIX_P_1_175 v0.h[5] +#define XFIX_P_1_501 v0.h[6] +#define XFIX_N_1_847 v0.h[7] +#define XFIX_N_1_961 v1.h[0] +#define XFIX_P_2_053 v1.h[1] +#define XFIX_N_2_562 v1.h[2] +#define XFIX_P_3_072 v1.h[3] + +asm_function jsimd_fdct_islow_neon + + DATA .req x0 + TMP .req x9 + + /* Load constants */ + adr TMP, Ljsimd_fdct_islow_neon_consts + ld1 {v0.8h, v1.8h}, [TMP] + + /* Save NEON registers */ + sub sp, sp, #64 + mov x10, sp + st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x10], 32 + st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x10], 32 + + /* Load all DATA into NEON registers with the following allocation: + * 0 1 2 3 | 4 5 6 7 + * ---------+-------- + * 0 | d16 | d17 | v16.8h + * 1 | d18 | d19 | v17.8h + * 2 | d20 | d21 | v18.8h + * 3 | d22 | d23 | v19.8h + * 4 | d24 | d25 | v20.8h + * 5 | d26 | d27 | v21.8h + * 6 | d28 | d29 | v22.8h + * 7 | d30 | d31 | v23.8h + */ + + ld1 {v16.8h, v17.8h, v18.8h, v19.8h}, [DATA], 64 + ld1 {v20.8h, v21.8h, v22.8h, v23.8h}, [DATA] + sub DATA, DATA, #64 + + /* Transpose */ + transpose_8x8 v16, v17, v18, v19, v20, v21, v22, v23, v31, v2, v3, v4 + /* 1-D FDCT */ + add v24.8h, v16.8h, v23.8h /* tmp0 = dataptr[0] + dataptr[7]; */ + sub v31.8h, v16.8h, v23.8h /* tmp7 = dataptr[0] - dataptr[7]; */ + add v25.8h, v17.8h, v22.8h /* tmp1 = dataptr[1] + dataptr[6]; */ + sub v30.8h, v17.8h, v22.8h /* tmp6 = dataptr[1] - dataptr[6]; */ + add v26.8h, v18.8h, v21.8h /* tmp2 = dataptr[2] + dataptr[5]; */ + sub v29.8h, v18.8h, v21.8h /* tmp5 = dataptr[2] - dataptr[5]; */ + add v27.8h, v19.8h, v20.8h /* tmp3 = dataptr[3] + dataptr[4]; */ + sub v28.8h, v19.8h, v20.8h /* tmp4 = dataptr[3] - dataptr[4]; */ + + /* even part */ + + add v8.8h, v24.8h, v27.8h /* tmp10 = tmp0 + tmp3; */ + sub v9.8h, v24.8h, v27.8h /* tmp13 = tmp0 - tmp3; */ + add v10.8h, v25.8h, v26.8h /* tmp11 = tmp1 + tmp2; */ + sub v11.8h, v25.8h, v26.8h /* tmp12 = tmp1 - tmp2; */ + + add v16.8h, v8.8h, v10.8h /* tmp10 + tmp11 */ + sub v20.8h, v8.8h, v10.8h /* tmp10 - tmp11 */ + + add v18.8h, v11.8h, v9.8h /* tmp12 + tmp13 */ + + shl v16.8h, v16.8h, #PASS1_BITS /* dataptr[0] = (DCTELEM) LEFT_SHIFT(tmp10 + tmp11, PASS1_BITS); */ + shl v20.8h, v20.8h, #PASS1_BITS /* dataptr[4] = (DCTELEM) LEFT_SHIFT(tmp10 - tmp11, PASS1_BITS); */ + + smull2 v24.4s, v18.8h, XFIX_P_0_541 /* z1 hi = MULTIPLY(tmp12 + tmp13, XFIX_P_0_541); */ + smull v18.4s, v18.4h, XFIX_P_0_541 /* z1 lo = MULTIPLY(tmp12 + tmp13, XFIX_P_0_541); */ + mov v22.16b, v18.16b + mov v25.16b, v24.16b + + smlal v18.4s, v9.4h, XFIX_P_0_765 /* lo z1 + MULTIPLY(tmp13, XFIX_P_0_765) */ + smlal2 v24.4s, v9.8h, XFIX_P_0_765 /* hi z1 + MULTIPLY(tmp13, XFIX_P_0_765) */ + smlal v22.4s, v11.4h, XFIX_N_1_847 /* lo z1 + MULTIPLY(tmp12, XFIX_N_1_847) */ + smlal2 v25.4s, v11.8h, XFIX_N_1_847 /* hi z1 + MULTIPLY(tmp12, XFIX_N_1_847) */ + + rshrn v18.4h, v18.4s, #DESCALE_P1 + rshrn v22.4h, v22.4s, #DESCALE_P1 + rshrn2 v18.8h, v24.4s, #DESCALE_P1 /* dataptr[2] = (DCTELEM) DESCALE(z1 + MULTIPLY(tmp13, XFIX_P_0_765), CONST_BITS-PASS1_BITS); */ + rshrn2 v22.8h, v25.4s, #DESCALE_P1 /* dataptr[6] = (DCTELEM) DESCALE(z1 + MULTIPLY(tmp12, XFIX_N_1_847), CONST_BITS-PASS1_BITS); */ + + /* Odd part */ + + add v8.8h, v28.8h, v31.8h /* z1 = tmp4 + tmp7; */ + add v9.8h, v29.8h, v30.8h /* z2 = tmp5 + tmp6; */ + add v10.8h, v28.8h, v30.8h /* z3 = tmp4 + tmp6; */ + add v11.8h, v29.8h, v31.8h /* z4 = tmp5 + tmp7; */ + smull v4.4s, v10.4h, XFIX_P_1_175 /* z5 lo = z3 lo * XFIX_P_1_175 */ + smull2 v5.4s, v10.8h, XFIX_P_1_175 + smlal v4.4s, v11.4h, XFIX_P_1_175 /* z5 = MULTIPLY(z3 + z4, FIX_1_175875602); */ + smlal2 v5.4s, v11.8h, XFIX_P_1_175 + + smull2 v24.4s, v28.8h, XFIX_P_0_298 + smull2 v25.4s, v29.8h, XFIX_P_2_053 + smull2 v26.4s, v30.8h, XFIX_P_3_072 + smull2 v27.4s, v31.8h, XFIX_P_1_501 + smull v28.4s, v28.4h, XFIX_P_0_298 /* tmp4 = MULTIPLY(tmp4, FIX_0_298631336); */ + smull v29.4s, v29.4h, XFIX_P_2_053 /* tmp5 = MULTIPLY(tmp5, FIX_2_053119869); */ + smull v30.4s, v30.4h, XFIX_P_3_072 /* tmp6 = MULTIPLY(tmp6, FIX_3_072711026); */ + smull v31.4s, v31.4h, XFIX_P_1_501 /* tmp7 = MULTIPLY(tmp7, FIX_1_501321110); */ + + smull2 v12.4s, v8.8h, XFIX_N_0_899 + smull2 v13.4s, v9.8h, XFIX_N_2_562 + smull2 v14.4s, v10.8h, XFIX_N_1_961 + smull2 v15.4s, v11.8h, XFIX_N_0_390 + smull v8.4s, v8.4h, XFIX_N_0_899 /* z1 = MULTIPLY(z1, - FIX_0_899976223); */ + smull v9.4s, v9.4h, XFIX_N_2_562 /* z2 = MULTIPLY(z2, - FIX_2_562915447); */ + smull v10.4s, v10.4h, XFIX_N_1_961 /* z3 = MULTIPLY(z3, - FIX_1_961570560); */ + smull v11.4s, v11.4h, XFIX_N_0_390 /* z4 = MULTIPLY(z4, - FIX_0_390180644); */ + + add v10.4s, v10.4s, v4.4s /* z3 += z5 */ + add v14.4s, v14.4s, v5.4s + add v11.4s, v11.4s, v4.4s /* z4 += z5 */ + add v15.4s, v15.4s, v5.4s + + add v28.4s, v28.4s, v8.4s /* tmp4 += z1 */ + add v24.4s, v24.4s, v12.4s + add v29.4s, v29.4s, v9.4s /* tmp5 += z2 */ + add v25.4s, v25.4s, v13.4s + add v30.4s, v30.4s, v10.4s /* tmp6 += z3 */ + add v26.4s, v26.4s, v14.4s + add v31.4s, v31.4s, v11.4s /* tmp7 += z4 */ + add v27.4s, v27.4s, v15.4s + + add v28.4s, v28.4s, v10.4s /* tmp4 += z3 */ + add v24.4s, v24.4s, v14.4s + add v29.4s, v29.4s, v11.4s /* tmp5 += z4 */ + add v25.4s, v25.4s, v15.4s + add v30.4s, v30.4s, v9.4s /* tmp6 += z2 */ + add v26.4s, v26.4s, v13.4s + add v31.4s, v31.4s, v8.4s /* tmp7 += z1 */ + add v27.4s, v27.4s, v12.4s + + rshrn v23.4h, v28.4s, #DESCALE_P1 + rshrn v21.4h, v29.4s, #DESCALE_P1 + rshrn v19.4h, v30.4s, #DESCALE_P1 + rshrn v17.4h, v31.4s, #DESCALE_P1 + rshrn2 v23.8h, v24.4s, #DESCALE_P1 /* dataptr[7] = (DCTELEM) DESCALE(tmp4 + z1 + z3, CONST_BITS-PASS1_BITS); */ + rshrn2 v21.8h, v25.4s, #DESCALE_P1 /* dataptr[5] = (DCTELEM) DESCALE(tmp5 + z2 + z4, CONST_BITS-PASS1_BITS); */ + rshrn2 v19.8h, v26.4s, #DESCALE_P1 /* dataptr[3] = (DCTELEM) DESCALE(tmp6 + z2 + z3, CONST_BITS-PASS1_BITS); */ + rshrn2 v17.8h, v27.4s, #DESCALE_P1 /* dataptr[1] = (DCTELEM) DESCALE(tmp7 + z1 + z4, CONST_BITS-PASS1_BITS); */ + + /* Transpose */ + transpose_8x8 v16, v17, v18, v19, v20, v21, v22, v23, v31, v2, v3, v4 + + /* 1-D FDCT */ + add v24.8h, v16.8h, v23.8h /* tmp0 = dataptr[0] + dataptr[7]; */ + sub v31.8h, v16.8h, v23.8h /* tmp7 = dataptr[0] - dataptr[7]; */ + add v25.8h, v17.8h, v22.8h /* tmp1 = dataptr[1] + dataptr[6]; */ + sub v30.8h, v17.8h, v22.8h /* tmp6 = dataptr[1] - dataptr[6]; */ + add v26.8h, v18.8h, v21.8h /* tmp2 = dataptr[2] + dataptr[5]; */ + sub v29.8h, v18.8h, v21.8h /* tmp5 = dataptr[2] - dataptr[5]; */ + add v27.8h, v19.8h, v20.8h /* tmp3 = dataptr[3] + dataptr[4]; */ + sub v28.8h, v19.8h, v20.8h /* tmp4 = dataptr[3] - dataptr[4]; */ + + /* even part */ + add v8.8h, v24.8h, v27.8h /* tmp10 = tmp0 + tmp3; */ + sub v9.8h, v24.8h, v27.8h /* tmp13 = tmp0 - tmp3; */ + add v10.8h, v25.8h, v26.8h /* tmp11 = tmp1 + tmp2; */ + sub v11.8h, v25.8h, v26.8h /* tmp12 = tmp1 - tmp2; */ + + add v16.8h, v8.8h, v10.8h /* tmp10 + tmp11 */ + sub v20.8h, v8.8h, v10.8h /* tmp10 - tmp11 */ + + add v18.8h, v11.8h, v9.8h /* tmp12 + tmp13 */ + + srshr v16.8h, v16.8h, #PASS1_BITS /* dataptr[0] = (DCTELEM) DESCALE(tmp10 + tmp11, PASS1_BITS); */ + srshr v20.8h, v20.8h, #PASS1_BITS /* dataptr[4] = (DCTELEM) DESCALE(tmp10 - tmp11, PASS1_BITS); */ + + smull2 v24.4s, v18.8h, XFIX_P_0_541 /* z1 hi = MULTIPLY(tmp12 + tmp13, XFIX_P_0_541); */ + smull v18.4s, v18.4h, XFIX_P_0_541 /* z1 lo = MULTIPLY(tmp12 + tmp13, XFIX_P_0_541); */ + mov v22.16b, v18.16b + mov v25.16b, v24.16b + + smlal v18.4s, v9.4h, XFIX_P_0_765 /* lo z1 + MULTIPLY(tmp13, XFIX_P_0_765) */ + smlal2 v24.4s, v9.8h, XFIX_P_0_765 /* hi z1 + MULTIPLY(tmp13, XFIX_P_0_765) */ + smlal v22.4s, v11.4h, XFIX_N_1_847 /* lo z1 + MULTIPLY(tmp12, XFIX_N_1_847) */ + smlal2 v25.4s, v11.8h, XFIX_N_1_847 /* hi z1 + MULTIPLY(tmp12, XFIX_N_1_847) */ + + rshrn v18.4h, v18.4s, #DESCALE_P2 + rshrn v22.4h, v22.4s, #DESCALE_P2 + rshrn2 v18.8h, v24.4s, #DESCALE_P2 /* dataptr[2] = (DCTELEM) DESCALE(z1 + MULTIPLY(tmp13, XFIX_P_0_765), CONST_BITS-PASS1_BITS); */ + rshrn2 v22.8h, v25.4s, #DESCALE_P2 /* dataptr[6] = (DCTELEM) DESCALE(z1 + MULTIPLY(tmp12, XFIX_N_1_847), CONST_BITS-PASS1_BITS); */ + + /* Odd part */ + add v8.8h, v28.8h, v31.8h /* z1 = tmp4 + tmp7; */ + add v9.8h, v29.8h, v30.8h /* z2 = tmp5 + tmp6; */ + add v10.8h, v28.8h, v30.8h /* z3 = tmp4 + tmp6; */ + add v11.8h, v29.8h, v31.8h /* z4 = tmp5 + tmp7; */ + + smull v4.4s, v10.4h, XFIX_P_1_175 /* z5 lo = z3 lo * XFIX_P_1_175 */ + smull2 v5.4s, v10.8h, XFIX_P_1_175 + smlal v4.4s, v11.4h, XFIX_P_1_175 /* z5 = MULTIPLY(z3 + z4, FIX_1_175875602); */ + smlal2 v5.4s, v11.8h, XFIX_P_1_175 + + smull2 v24.4s, v28.8h, XFIX_P_0_298 + smull2 v25.4s, v29.8h, XFIX_P_2_053 + smull2 v26.4s, v30.8h, XFIX_P_3_072 + smull2 v27.4s, v31.8h, XFIX_P_1_501 + smull v28.4s, v28.4h, XFIX_P_0_298 /* tmp4 = MULTIPLY(tmp4, FIX_0_298631336); */ + smull v29.4s, v29.4h, XFIX_P_2_053 /* tmp5 = MULTIPLY(tmp5, FIX_2_053119869); */ + smull v30.4s, v30.4h, XFIX_P_3_072 /* tmp6 = MULTIPLY(tmp6, FIX_3_072711026); */ + smull v31.4s, v31.4h, XFIX_P_1_501 /* tmp7 = MULTIPLY(tmp7, FIX_1_501321110); */ + + smull2 v12.4s, v8.8h, XFIX_N_0_899 + smull2 v13.4s, v9.8h, XFIX_N_2_562 + smull2 v14.4s, v10.8h, XFIX_N_1_961 + smull2 v15.4s, v11.8h, XFIX_N_0_390 + smull v8.4s, v8.4h, XFIX_N_0_899 /* z1 = MULTIPLY(z1, - FIX_0_899976223); */ + smull v9.4s, v9.4h, XFIX_N_2_562 /* z2 = MULTIPLY(z2, - FIX_2_562915447); */ + smull v10.4s, v10.4h, XFIX_N_1_961 /* z3 = MULTIPLY(z3, - FIX_1_961570560); */ + smull v11.4s, v11.4h, XFIX_N_0_390 /* z4 = MULTIPLY(z4, - FIX_0_390180644); */ + + add v10.4s, v10.4s, v4.4s + add v14.4s, v14.4s, v5.4s + add v11.4s, v11.4s, v4.4s + add v15.4s, v15.4s, v5.4s + + add v28.4s, v28.4s, v8.4s /* tmp4 += z1 */ + add v24.4s, v24.4s, v12.4s + add v29.4s, v29.4s, v9.4s /* tmp5 += z2 */ + add v25.4s, v25.4s, v13.4s + add v30.4s, v30.4s, v10.4s /* tmp6 += z3 */ + add v26.4s, v26.4s, v14.4s + add v31.4s, v31.4s, v11.4s /* tmp7 += z4 */ + add v27.4s, v27.4s, v15.4s + + add v28.4s, v28.4s, v10.4s /* tmp4 += z3 */ + add v24.4s, v24.4s, v14.4s + add v29.4s, v29.4s, v11.4s /* tmp5 += z4 */ + add v25.4s, v25.4s, v15.4s + add v30.4s, v30.4s, v9.4s /* tmp6 += z2 */ + add v26.4s, v26.4s, v13.4s + add v31.4s, v31.4s, v8.4s /* tmp7 += z1 */ + add v27.4s, v27.4s, v12.4s + + rshrn v23.4h, v28.4s, #DESCALE_P2 + rshrn v21.4h, v29.4s, #DESCALE_P2 + rshrn v19.4h, v30.4s, #DESCALE_P2 + rshrn v17.4h, v31.4s, #DESCALE_P2 + rshrn2 v23.8h, v24.4s, #DESCALE_P2 /* dataptr[7] = (DCTELEM) DESCALE(tmp4 + z1 + z3, CONST_BITS-PASS1_BITS); */ + rshrn2 v21.8h, v25.4s, #DESCALE_P2 /* dataptr[5] = (DCTELEM) DESCALE(tmp5 + z2 + z4, CONST_BITS-PASS1_BITS); */ + rshrn2 v19.8h, v26.4s, #DESCALE_P2 /* dataptr[3] = (DCTELEM) DESCALE(tmp6 + z2 + z3, CONST_BITS-PASS1_BITS); */ + rshrn2 v17.8h, v27.4s, #DESCALE_P2 /* dataptr[1] = (DCTELEM) DESCALE(tmp7 + z1 + z4, CONST_BITS-PASS1_BITS); */ + + /* store results */ + st1 {v16.8h, v17.8h, v18.8h, v19.8h}, [DATA], 64 + st1 {v20.8h, v21.8h, v22.8h, v23.8h}, [DATA] + + /* Restore NEON registers */ + ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32 + ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32 + + br x30 + + .unreq DATA + .unreq TMP + +#undef XFIX_P_0_298 +#undef XFIX_N_0_390 +#undef XFIX_P_0_541 +#undef XFIX_P_0_765 +#undef XFIX_N_0_899 +#undef XFIX_P_1_175 +#undef XFIX_P_1_501 +#undef XFIX_N_1_847 +#undef XFIX_N_1_961 +#undef XFIX_P_2_053 +#undef XFIX_N_2_562 +#undef XFIX_P_3_072 + + +/*****************************************************************************/ + +/* + * jsimd_fdct_ifast_neon + * + * This function contains a fast, not so accurate integer implementation of + * the forward DCT (Discrete Cosine Transform). It uses the same calculations + * and produces exactly the same output as IJG's original 'jpeg_fdct_ifast' + * function from jfdctfst.c + * + * TODO: can be combined with 'jsimd_convsamp_neon' to get + * rid of a bunch of VLD1.16 instructions + */ + +#undef XFIX_0_541196100 +#define XFIX_0_382683433 v0.h[0] +#define XFIX_0_541196100 v0.h[1] +#define XFIX_0_707106781 v0.h[2] +#define XFIX_1_306562965 v0.h[3] + +.balign 16 +Ljsimd_fdct_ifast_neon_consts: + .short (98 * 128) /* XFIX_0_382683433 */ + .short (139 * 128) /* XFIX_0_541196100 */ + .short (181 * 128) /* XFIX_0_707106781 */ + .short (334 * 128 - 256 * 128) /* XFIX_1_306562965 */ + +asm_function jsimd_fdct_ifast_neon + + DATA .req x0 + TMP .req x9 + + /* Load constants */ + adr TMP, Ljsimd_fdct_ifast_neon_consts + ld1 {v0.4h}, [TMP] + + /* Load all DATA into NEON registers with the following allocation: + * 0 1 2 3 | 4 5 6 7 + * ---------+-------- + * 0 | d16 | d17 | v0.8h + * 1 | d18 | d19 | q9 + * 2 | d20 | d21 | q10 + * 3 | d22 | d23 | q11 + * 4 | d24 | d25 | q12 + * 5 | d26 | d27 | q13 + * 6 | d28 | d29 | q14 + * 7 | d30 | d31 | q15 + */ + + ld1 {v16.8h, v17.8h, v18.8h, v19.8h}, [DATA], 64 + ld1 {v20.8h, v21.8h, v22.8h, v23.8h}, [DATA] + mov TMP, #2 + sub DATA, DATA, #64 +1: + /* Transpose */ + transpose_8x8 v16, v17, v18, v19, v20, v21, v22, v23, v1, v2, v3, v4 + subs TMP, TMP, #1 + /* 1-D FDCT */ + add v4.8h, v19.8h, v20.8h + sub v20.8h, v19.8h, v20.8h + sub v28.8h, v18.8h, v21.8h + add v18.8h, v18.8h, v21.8h + sub v29.8h, v17.8h, v22.8h + add v17.8h, v17.8h, v22.8h + sub v21.8h, v16.8h, v23.8h + add v16.8h, v16.8h, v23.8h + sub v6.8h, v17.8h, v18.8h + sub v7.8h, v16.8h, v4.8h + add v5.8h, v17.8h, v18.8h + add v6.8h, v6.8h, v7.8h + add v4.8h, v16.8h, v4.8h + sqdmulh v6.8h, v6.8h, XFIX_0_707106781 + add v19.8h, v20.8h, v28.8h + add v16.8h, v4.8h, v5.8h + sub v20.8h, v4.8h, v5.8h + add v5.8h, v28.8h, v29.8h + add v29.8h, v29.8h, v21.8h + sqdmulh v5.8h, v5.8h, XFIX_0_707106781 + sub v28.8h, v19.8h, v29.8h + add v18.8h, v7.8h, v6.8h + sqdmulh v28.8h, v28.8h, XFIX_0_382683433 + sub v22.8h, v7.8h, v6.8h + sqdmulh v19.8h, v19.8h, XFIX_0_541196100 + sqdmulh v7.8h, v29.8h, XFIX_1_306562965 + add v6.8h, v21.8h, v5.8h + sub v5.8h, v21.8h, v5.8h + add v29.8h, v29.8h, v28.8h + add v19.8h, v19.8h, v28.8h + add v29.8h, v29.8h, v7.8h + add v21.8h, v5.8h, v19.8h + sub v19.8h, v5.8h, v19.8h + add v17.8h, v6.8h, v29.8h + sub v23.8h, v6.8h, v29.8h + + b.ne 1b + + /* store results */ + st1 {v16.8h, v17.8h, v18.8h, v19.8h}, [DATA], 64 + st1 {v20.8h, v21.8h, v22.8h, v23.8h}, [DATA] + + br x30 + + .unreq DATA + .unreq TMP +#undef XFIX_0_382683433 +#undef XFIX_0_541196100 +#undef XFIX_0_707106781 +#undef XFIX_1_306562965 + + +/*****************************************************************************/ + +/* + * GLOBAL(void) + * jsimd_quantize_neon (JCOEFPTR coef_block, DCTELEM *divisors, + * DCTELEM *workspace); + * + */ +asm_function jsimd_quantize_neon + + COEF_BLOCK .req x0 + DIVISORS .req x1 + WORKSPACE .req x2 + + RECIPROCAL .req DIVISORS + CORRECTION .req x9 + SHIFT .req x10 + LOOP_COUNT .req x11 + + mov LOOP_COUNT, #2 + add CORRECTION, DIVISORS, #(64 * 2) + add SHIFT, DIVISORS, #(64 * 6) +1: + subs LOOP_COUNT, LOOP_COUNT, #1 + ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [WORKSPACE], 64 + ld1 {v4.8h, v5.8h, v6.8h, v7.8h}, [CORRECTION], 64 + abs v20.8h, v0.8h + abs v21.8h, v1.8h + abs v22.8h, v2.8h + abs v23.8h, v3.8h + ld1 {v28.8h, v29.8h, v30.8h, v31.8h}, [RECIPROCAL], 64 + add v20.8h, v20.8h, v4.8h /* add correction */ + add v21.8h, v21.8h, v5.8h + add v22.8h, v22.8h, v6.8h + add v23.8h, v23.8h, v7.8h + umull v4.4s, v20.4h, v28.4h /* multiply by reciprocal */ + umull2 v16.4s, v20.8h, v28.8h + umull v5.4s, v21.4h, v29.4h + umull2 v17.4s, v21.8h, v29.8h + umull v6.4s, v22.4h, v30.4h /* multiply by reciprocal */ + umull2 v18.4s, v22.8h, v30.8h + umull v7.4s, v23.4h, v31.4h + umull2 v19.4s, v23.8h, v31.8h + ld1 {v24.8h, v25.8h, v26.8h, v27.8h}, [SHIFT], 64 + shrn v4.4h, v4.4s, #16 + shrn v5.4h, v5.4s, #16 + shrn v6.4h, v6.4s, #16 + shrn v7.4h, v7.4s, #16 + shrn2 v4.8h, v16.4s, #16 + shrn2 v5.8h, v17.4s, #16 + shrn2 v6.8h, v18.4s, #16 + shrn2 v7.8h, v19.4s, #16 + neg v24.8h, v24.8h + neg v25.8h, v25.8h + neg v26.8h, v26.8h + neg v27.8h, v27.8h + sshr v0.8h, v0.8h, #15 /* extract sign */ + sshr v1.8h, v1.8h, #15 + sshr v2.8h, v2.8h, #15 + sshr v3.8h, v3.8h, #15 + ushl v4.8h, v4.8h, v24.8h /* shift */ + ushl v5.8h, v5.8h, v25.8h + ushl v6.8h, v6.8h, v26.8h + ushl v7.8h, v7.8h, v27.8h + + eor v4.16b, v4.16b, v0.16b /* restore sign */ + eor v5.16b, v5.16b, v1.16b + eor v6.16b, v6.16b, v2.16b + eor v7.16b, v7.16b, v3.16b + sub v4.8h, v4.8h, v0.8h + sub v5.8h, v5.8h, v1.8h + sub v6.8h, v6.8h, v2.8h + sub v7.8h, v7.8h, v3.8h + st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [COEF_BLOCK], 64 + + b.ne 1b + + br x30 /* return */ + + .unreq COEF_BLOCK + .unreq DIVISORS + .unreq WORKSPACE + .unreq RECIPROCAL + .unreq CORRECTION + .unreq SHIFT + .unreq LOOP_COUNT + + +/*****************************************************************************/ + +/* + * Downsample pixel values of a single component. + * This version handles the common case of 2:1 horizontal and 1:1 vertical, + * without smoothing. + * + * GLOBAL(void) + * jsimd_h2v1_downsample_neon (JDIMENSION image_width, int max_v_samp_factor, + * JDIMENSION v_samp_factor, + * JDIMENSION width_blocks, JSAMPARRAY input_data, + * JSAMPARRAY output_data); + */ + +.balign 16 +Ljsimd_h2_downsample_neon_consts: + .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \ + 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F /* diff 0 */ + .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \ + 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0E /* diff 1 */ + .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \ + 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0D, 0x0D /* diff 2 */ + .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \ + 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0C, 0x0C, 0x0C /* diff 3 */ + .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \ + 0x08, 0x09, 0x0A, 0x0B, 0x0B, 0x0B, 0x0B, 0x0B /* diff 4 */ + .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \ + 0x08, 0x09, 0x0A, 0x0A, 0x0A, 0x0A, 0x0A, 0x0A /* diff 5 */ + .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \ + 0x08, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09 /* diff 6 */ + .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \ + 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08 /* diff 7 */ + .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \ + 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07 /* diff 8 */ + .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x06, \ + 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06 /* diff 9 */ + .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x05, 0x05, \ + 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05 /* diff 10 */ + .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x04, 0x04, 0x04, \ + 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04 /* diff 11 */ + .byte 0x00, 0x01, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, \ + 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03 /* diff 12 */ + .byte 0x00, 0x01, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, \ + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02 /* diff 13 */ + .byte 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, \ + 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01 /* diff 14 */ + .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, \ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 /* diff 15 */ + +asm_function jsimd_h2v1_downsample_neon + IMAGE_WIDTH .req x0 + MAX_V_SAMP .req x1 + V_SAMP .req x2 + BLOCK_WIDTH .req x3 + INPUT_DATA .req x4 + OUTPUT_DATA .req x5 + OUTPTR .req x9 + INPTR .req x10 + TMP1 .req x11 + TMP2 .req x12 + TMP3 .req x13 + TMPDUP .req w15 + + mov TMPDUP, #0x10000 + lsl TMP2, BLOCK_WIDTH, #4 + sub TMP2, TMP2, IMAGE_WIDTH + adr TMP3, Ljsimd_h2_downsample_neon_consts + add TMP3, TMP3, TMP2, lsl #4 + dup v16.4s, TMPDUP + ld1 {v18.16b}, [TMP3] + +1: /* row loop */ + ldr INPTR, [INPUT_DATA], #8 + ldr OUTPTR, [OUTPUT_DATA], #8 + subs TMP1, BLOCK_WIDTH, #1 + b.eq 3f +2: /* columns */ + ld1 {v0.16b}, [INPTR], #16 + mov v4.16b, v16.16b + subs TMP1, TMP1, #1 + uadalp v4.8h, v0.16b + shrn v6.8b, v4.8h, #1 + st1 {v6.8b}, [OUTPTR], #8 + b.ne 2b +3: /* last columns */ + ld1 {v0.16b}, [INPTR] + mov v4.16b, v16.16b + subs V_SAMP, V_SAMP, #1 + /* expand right */ + tbl v2.16b, {v0.16b}, v18.16b + uadalp v4.8h, v2.16b + shrn v6.8b, v4.8h, #1 + st1 {v6.8b}, [OUTPTR], #8 + b.ne 1b + + br x30 + + .unreq IMAGE_WIDTH + .unreq MAX_V_SAMP + .unreq V_SAMP + .unreq BLOCK_WIDTH + .unreq INPUT_DATA + .unreq OUTPUT_DATA + .unreq OUTPTR + .unreq INPTR + .unreq TMP1 + .unreq TMP2 + .unreq TMP3 + .unreq TMPDUP + + +/*****************************************************************************/ + +/* + * Downsample pixel values of a single component. + * This version handles the common case of 2:1 horizontal and 2:1 vertical, + * without smoothing. + * + * GLOBAL(void) + * jsimd_h2v2_downsample_neon (JDIMENSION image_width, int max_v_samp_factor, + * JDIMENSION v_samp_factor, JDIMENSION width_blocks, + * JSAMPARRAY input_data, JSAMPARRAY output_data); + */ + +.balign 16 +asm_function jsimd_h2v2_downsample_neon + IMAGE_WIDTH .req x0 + MAX_V_SAMP .req x1 + V_SAMP .req x2 + BLOCK_WIDTH .req x3 + INPUT_DATA .req x4 + OUTPUT_DATA .req x5 + OUTPTR .req x9 + INPTR0 .req x10 + INPTR1 .req x14 + TMP1 .req x11 + TMP2 .req x12 + TMP3 .req x13 + TMPDUP .req w15 + + mov TMPDUP, #1 + lsl TMP2, BLOCK_WIDTH, #4 + lsl TMPDUP, TMPDUP, #17 + sub TMP2, TMP2, IMAGE_WIDTH + adr TMP3, Ljsimd_h2_downsample_neon_consts + orr TMPDUP, TMPDUP, #1 + add TMP3, TMP3, TMP2, lsl #4 + dup v16.4s, TMPDUP + ld1 {v18.16b}, [TMP3] + +1: /* row loop */ + ldr INPTR0, [INPUT_DATA], #8 + ldr OUTPTR, [OUTPUT_DATA], #8 + ldr INPTR1, [INPUT_DATA], #8 + subs TMP1, BLOCK_WIDTH, #1 + b.eq 3f +2: /* columns */ + ld1 {v0.16b}, [INPTR0], #16 + ld1 {v1.16b}, [INPTR1], #16 + mov v4.16b, v16.16b + subs TMP1, TMP1, #1 + uadalp v4.8h, v0.16b + uadalp v4.8h, v1.16b + shrn v6.8b, v4.8h, #2 + st1 {v6.8b}, [OUTPTR], #8 + b.ne 2b +3: /* last columns */ + ld1 {v0.16b}, [INPTR0], #16 + ld1 {v1.16b}, [INPTR1], #16 + mov v4.16b, v16.16b + subs V_SAMP, V_SAMP, #1 + /* expand right */ + tbl v2.16b, {v0.16b}, v18.16b + tbl v3.16b, {v1.16b}, v18.16b + uadalp v4.8h, v2.16b + uadalp v4.8h, v3.16b + shrn v6.8b, v4.8h, #2 + st1 {v6.8b}, [OUTPTR], #8 + b.ne 1b + + br x30 + + .unreq IMAGE_WIDTH + .unreq MAX_V_SAMP + .unreq V_SAMP + .unreq BLOCK_WIDTH + .unreq INPUT_DATA + .unreq OUTPUT_DATA + .unreq OUTPTR + .unreq INPTR0 + .unreq INPTR1 + .unreq TMP1 + .unreq TMP2 + .unreq TMP3 + .unreq TMPDUP + + +/*****************************************************************************/ + +/* + * GLOBAL(JOCTET*) + * jsimd_huff_encode_one_block (working_state *state, JOCTET *buffer, + * JCOEFPTR block, int last_dc_val, + * c_derived_tbl *dctbl, c_derived_tbl *actbl) + * + */ + + BUFFER .req x1 + PUT_BUFFER .req x6 + PUT_BITS .req x7 + PUT_BITSw .req w7 + +.macro emit_byte + sub PUT_BITS, PUT_BITS, #0x8 + lsr x19, PUT_BUFFER, PUT_BITS + uxtb w19, w19 + strb w19, [BUFFER, #1]! + cmp w19, #0xff + b.ne 14f + strb wzr, [BUFFER, #1]! +14: +.endm +.macro put_bits CODE, SIZE + lsl PUT_BUFFER, PUT_BUFFER, \SIZE + add PUT_BITS, PUT_BITS, \SIZE + orr PUT_BUFFER, PUT_BUFFER, \CODE +.endm +.macro checkbuf31 + cmp PUT_BITS, #0x20 + b.lt 31f + emit_byte + emit_byte + emit_byte + emit_byte +31: +.endm +.macro checkbuf47 + cmp PUT_BITS, #0x30 + b.lt 47f + emit_byte + emit_byte + emit_byte + emit_byte + emit_byte + emit_byte +47: +.endm + +.macro generate_jsimd_huff_encode_one_block fast_tbl + +.balign 16 +.if \fast_tbl == 1 +Ljsimd_huff_encode_one_block_neon_consts: +.else +Ljsimd_huff_encode_one_block_neon_slowtbl_consts: +.endif + .byte 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, \ + 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80 +.if \fast_tbl == 1 + .byte 0, 1, 2, 3, 16, 17, 32, 33, \ + 18, 19, 4, 5, 6, 7, 20, 21 /* L0 => L3 : 4 lines OK */ + .byte 34, 35, 48, 49, 255, 255, 50, 51, \ + 36, 37, 22, 23, 8, 9, 10, 11 /* L0 => L3 : 4 lines OK */ + .byte 8, 9, 22, 23, 36, 37, 50, 51, \ + 255, 255, 255, 255, 255, 255, 52, 53 /* L1 => L4 : 4 lines OK */ + .byte 54, 55, 40, 41, 26, 27, 12, 13, \ + 14, 15, 28, 29, 42, 43, 56, 57 /* L0 => L3 : 4 lines OK */ + .byte 6, 7, 20, 21, 34, 35, 48, 49, \ + 50, 51, 36, 37, 22, 23, 8, 9 /* L4 => L7 : 4 lines OK */ + .byte 42, 43, 28, 29, 14, 15, 30, 31, \ + 44, 45, 58, 59, 255, 255, 255, 255 /* L1 => L4 : 4 lines OK */ + .byte 255, 255, 255, 255, 56, 57, 42, 43, \ + 28, 29, 14, 15, 30, 31, 44, 45 /* L3 => L6 : 4 lines OK */ + .byte 26, 27, 40, 41, 42, 43, 28, 29, \ + 14, 15, 30, 31, 44, 45, 46, 47 /* L5 => L7 : 3 lines OK */ + .byte 255, 255, 255, 255, 0, 1, 255, 255, \ + 255, 255, 255, 255, 255, 255, 255, 255 /* L4 : 1 lines OK */ + .byte 255, 255, 255, 255, 255, 255, 255, 255, \ + 0, 1, 16, 17, 2, 3, 255, 255 /* L5 => L6 : 2 lines OK */ + .byte 255, 255, 255, 255, 255, 255, 255, 255, \ + 255, 255, 255, 255, 8, 9, 22, 23 /* L5 => L6 : 2 lines OK */ + .byte 4, 5, 6, 7, 255, 255, 255, 255, \ + 255, 255, 255, 255, 255, 255, 255, 255 /* L7 : 1 line OK */ +.endif + +.if \fast_tbl == 1 +asm_function jsimd_huff_encode_one_block_neon +.else +asm_function jsimd_huff_encode_one_block_neon_slowtbl +.endif + sub sp, sp, 272 + sub BUFFER, BUFFER, #0x1 /* BUFFER=buffer-- */ + /* Save ARM registers */ + stp x19, x20, [sp] +.if \fast_tbl == 1 + adr x15, Ljsimd_huff_encode_one_block_neon_consts +.else + adr x15, Ljsimd_huff_encode_one_block_neon_slowtbl_consts +.endif + ldr PUT_BUFFER, [x0, #0x10] + ldr PUT_BITSw, [x0, #0x18] + ldrsh w12, [x2] /* load DC coeff in w12 */ + /* prepare data */ +.if \fast_tbl == 1 + ld1 {v23.16b}, [x15], #16 + ld1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x15], #64 + ld1 {v4.16b, v5.16b, v6.16b, v7.16b}, [x15], #64 + ld1 {v16.16b, v17.16b, v18.16b, v19.16b}, [x15], #64 + ld1 {v24.16b, v25.16b, v26.16b, v27.16b}, [x2], #64 + ld1 {v28.16b, v29.16b, v30.16b, v31.16b}, [x2], #64 + sub w12, w12, w3 /* last_dc_val, not used afterwards */ + /* ZigZag 8x8 */ + tbl v0.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v0.16b + tbl v1.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v1.16b + tbl v2.16b, {v25.16b, v26.16b, v27.16b, v28.16b}, v2.16b + tbl v3.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v3.16b + tbl v4.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v4.16b + tbl v5.16b, {v25.16b, v26.16b, v27.16b, v28.16b}, v5.16b + tbl v6.16b, {v27.16b, v28.16b, v29.16b, v30.16b}, v6.16b + tbl v7.16b, {v29.16b, v30.16b, v31.16b}, v7.16b + ins v0.h[0], w12 + tbx v1.16b, {v28.16b}, v16.16b + tbx v2.16b, {v29.16b, v30.16b}, v17.16b + tbx v5.16b, {v29.16b, v30.16b}, v18.16b + tbx v6.16b, {v31.16b}, v19.16b +.else + add x13, x2, #0x22 + sub w12, w12, w3 /* last_dc_val, not used afterwards */ + ld1 {v23.16b}, [x15] + add x14, x2, #0x18 + add x3, x2, #0x36 + ins v0.h[0], w12 + add x9, x2, #0x2 + ld1 {v1.h}[0], [x13] + add x15, x2, #0x30 + ld1 {v2.h}[0], [x14] + add x19, x2, #0x26 + ld1 {v3.h}[0], [x3] + add x20, x2, #0x28 + ld1 {v0.h}[1], [x9] + add x12, x2, #0x10 + ld1 {v1.h}[1], [x15] + add x13, x2, #0x40 + ld1 {v2.h}[1], [x19] + add x14, x2, #0x34 + ld1 {v3.h}[1], [x20] + add x3, x2, #0x1a + ld1 {v0.h}[2], [x12] + add x9, x2, #0x20 + ld1 {v1.h}[2], [x13] + add x15, x2, #0x32 + ld1 {v2.h}[2], [x14] + add x19, x2, #0x42 + ld1 {v3.h}[2], [x3] + add x20, x2, #0xc + ld1 {v0.h}[3], [x9] + add x12, x2, #0x12 + ld1 {v1.h}[3], [x15] + add x13, x2, #0x24 + ld1 {v2.h}[3], [x19] + add x14, x2, #0x50 + ld1 {v3.h}[3], [x20] + add x3, x2, #0xe + ld1 {v0.h}[4], [x12] + add x9, x2, #0x4 + ld1 {v1.h}[4], [x13] + add x15, x2, #0x16 + ld1 {v2.h}[4], [x14] + add x19, x2, #0x60 + ld1 {v3.h}[4], [x3] + add x20, x2, #0x1c + ld1 {v0.h}[5], [x9] + add x12, x2, #0x6 + ld1 {v1.h}[5], [x15] + add x13, x2, #0x8 + ld1 {v2.h}[5], [x19] + add x14, x2, #0x52 + ld1 {v3.h}[5], [x20] + add x3, x2, #0x2a + ld1 {v0.h}[6], [x12] + add x9, x2, #0x14 + ld1 {v1.h}[6], [x13] + add x15, x2, #0xa + ld1 {v2.h}[6], [x14] + add x19, x2, #0x44 + ld1 {v3.h}[6], [x3] + add x20, x2, #0x38 + ld1 {v0.h}[7], [x9] + add x12, x2, #0x46 + ld1 {v1.h}[7], [x15] + add x13, x2, #0x3a + ld1 {v2.h}[7], [x19] + add x14, x2, #0x74 + ld1 {v3.h}[7], [x20] + add x3, x2, #0x6a + ld1 {v4.h}[0], [x12] + add x9, x2, #0x54 + ld1 {v5.h}[0], [x13] + add x15, x2, #0x2c + ld1 {v6.h}[0], [x14] + add x19, x2, #0x76 + ld1 {v7.h}[0], [x3] + add x20, x2, #0x78 + ld1 {v4.h}[1], [x9] + add x12, x2, #0x62 + ld1 {v5.h}[1], [x15] + add x13, x2, #0x1e + ld1 {v6.h}[1], [x19] + add x14, x2, #0x68 + ld1 {v7.h}[1], [x20] + add x3, x2, #0x7a + ld1 {v4.h}[2], [x12] + add x9, x2, #0x70 + ld1 {v5.h}[2], [x13] + add x15, x2, #0x2e + ld1 {v6.h}[2], [x14] + add x19, x2, #0x5a + ld1 {v7.h}[2], [x3] + add x20, x2, #0x6c + ld1 {v4.h}[3], [x9] + add x12, x2, #0x72 + ld1 {v5.h}[3], [x15] + add x13, x2, #0x3c + ld1 {v6.h}[3], [x19] + add x14, x2, #0x4c + ld1 {v7.h}[3], [x20] + add x3, x2, #0x5e + ld1 {v4.h}[4], [x12] + add x9, x2, #0x64 + ld1 {v5.h}[4], [x13] + add x15, x2, #0x4a + ld1 {v6.h}[4], [x14] + add x19, x2, #0x3e + ld1 {v7.h}[4], [x3] + add x20, x2, #0x6e + ld1 {v4.h}[5], [x9] + add x12, x2, #0x56 + ld1 {v5.h}[5], [x15] + add x13, x2, #0x58 + ld1 {v6.h}[5], [x19] + add x14, x2, #0x4e + ld1 {v7.h}[5], [x20] + add x3, x2, #0x7c + ld1 {v4.h}[6], [x12] + add x9, x2, #0x48 + ld1 {v5.h}[6], [x13] + add x15, x2, #0x66 + ld1 {v6.h}[6], [x14] + add x19, x2, #0x5c + ld1 {v7.h}[6], [x3] + add x20, x2, #0x7e + ld1 {v4.h}[7], [x9] + ld1 {v5.h}[7], [x15] + ld1 {v6.h}[7], [x19] + ld1 {v7.h}[7], [x20] +.endif + cmlt v24.8h, v0.8h, #0 + cmlt v25.8h, v1.8h, #0 + cmlt v26.8h, v2.8h, #0 + cmlt v27.8h, v3.8h, #0 + cmlt v28.8h, v4.8h, #0 + cmlt v29.8h, v5.8h, #0 + cmlt v30.8h, v6.8h, #0 + cmlt v31.8h, v7.8h, #0 + abs v0.8h, v0.8h + abs v1.8h, v1.8h + abs v2.8h, v2.8h + abs v3.8h, v3.8h + abs v4.8h, v4.8h + abs v5.8h, v5.8h + abs v6.8h, v6.8h + abs v7.8h, v7.8h + eor v24.16b, v24.16b, v0.16b + eor v25.16b, v25.16b, v1.16b + eor v26.16b, v26.16b, v2.16b + eor v27.16b, v27.16b, v3.16b + eor v28.16b, v28.16b, v4.16b + eor v29.16b, v29.16b, v5.16b + eor v30.16b, v30.16b, v6.16b + eor v31.16b, v31.16b, v7.16b + cmeq v16.8h, v0.8h, #0 + cmeq v17.8h, v1.8h, #0 + cmeq v18.8h, v2.8h, #0 + cmeq v19.8h, v3.8h, #0 + cmeq v20.8h, v4.8h, #0 + cmeq v21.8h, v5.8h, #0 + cmeq v22.8h, v6.8h, #0 + xtn v16.8b, v16.8h + xtn v18.8b, v18.8h + xtn v20.8b, v20.8h + xtn v22.8b, v22.8h + umov w14, v0.h[0] + xtn2 v16.16b, v17.8h + umov w13, v24.h[0] + xtn2 v18.16b, v19.8h + clz w14, w14 + xtn2 v20.16b, v21.8h + lsl w13, w13, w14 + cmeq v17.8h, v7.8h, #0 + sub w12, w14, #32 + xtn2 v22.16b, v17.8h + lsr w13, w13, w14 + and v16.16b, v16.16b, v23.16b + neg w12, w12 + and v18.16b, v18.16b, v23.16b + add x3, x4, #0x400 /* r1 = dctbl->ehufsi */ + and v20.16b, v20.16b, v23.16b + add x15, sp, #0x90 /* x15 = t2 */ + and v22.16b, v22.16b, v23.16b + ldr w10, [x4, x12, lsl #2] + addp v16.16b, v16.16b, v18.16b + ldrb w11, [x3, x12] + addp v20.16b, v20.16b, v22.16b + checkbuf47 + addp v16.16b, v16.16b, v20.16b + put_bits x10, x11 + addp v16.16b, v16.16b, v18.16b + checkbuf47 + umov x9,v16.D[0] + put_bits x13, x12 + cnt v17.8b, v16.8b + mvn x9, x9 + addv B18, v17.8b + add x4, x5, #0x400 /* x4 = actbl->ehufsi */ + umov w12, v18.b[0] + lsr x9, x9, #0x1 /* clear AC coeff */ + ldr w13, [x5, #0x3c0] /* x13 = actbl->ehufco[0xf0] */ + rbit x9, x9 /* x9 = index0 */ + ldrb w14, [x4, #0xf0] /* x14 = actbl->ehufsi[0xf0] */ + cmp w12, #(64-8) + add x11, sp, #16 + b.lt 4f + cbz x9, 6f + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x11], #64 + st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x11], #64 + st1 {v24.8h, v25.8h, v26.8h, v27.8h}, [x11], #64 + st1 {v28.8h, v29.8h, v30.8h, v31.8h}, [x11], #64 +1: + clz x2, x9 + add x15, x15, x2, lsl #1 + lsl x9, x9, x2 + ldrh w20, [x15, #-126] +2: + cmp x2, #0x10 + b.lt 3f + sub x2, x2, #0x10 + checkbuf47 + put_bits x13, x14 + b 2b +3: + clz w20, w20 + ldrh w3, [x15, #2]! + sub w11, w20, #32 + lsl w3, w3, w20 + neg w11, w11 + lsr w3, w3, w20 + add x2, x11, x2, lsl #4 + lsl x9, x9, #0x1 + ldr w12, [x5, x2, lsl #2] + ldrb w10, [x4, x2] + checkbuf31 + put_bits x12, x10 + put_bits x3, x11 + cbnz x9, 1b + b 6f +4: + movi v21.8h, #0x0010 + clz v0.8h, v0.8h + clz v1.8h, v1.8h + clz v2.8h, v2.8h + clz v3.8h, v3.8h + clz v4.8h, v4.8h + clz v5.8h, v5.8h + clz v6.8h, v6.8h + clz v7.8h, v7.8h + ushl v24.8h, v24.8h, v0.8h + ushl v25.8h, v25.8h, v1.8h + ushl v26.8h, v26.8h, v2.8h + ushl v27.8h, v27.8h, v3.8h + ushl v28.8h, v28.8h, v4.8h + ushl v29.8h, v29.8h, v5.8h + ushl v30.8h, v30.8h, v6.8h + ushl v31.8h, v31.8h, v7.8h + neg v0.8h, v0.8h + neg v1.8h, v1.8h + neg v2.8h, v2.8h + neg v3.8h, v3.8h + neg v4.8h, v4.8h + neg v5.8h, v5.8h + neg v6.8h, v6.8h + neg v7.8h, v7.8h + ushl v24.8h, v24.8h, v0.8h + ushl v25.8h, v25.8h, v1.8h + ushl v26.8h, v26.8h, v2.8h + ushl v27.8h, v27.8h, v3.8h + ushl v28.8h, v28.8h, v4.8h + ushl v29.8h, v29.8h, v5.8h + ushl v30.8h, v30.8h, v6.8h + ushl v31.8h, v31.8h, v7.8h + add v0.8h, v21.8h, v0.8h + add v1.8h, v21.8h, v1.8h + add v2.8h, v21.8h, v2.8h + add v3.8h, v21.8h, v3.8h + add v4.8h, v21.8h, v4.8h + add v5.8h, v21.8h, v5.8h + add v6.8h, v21.8h, v6.8h + add v7.8h, v21.8h, v7.8h + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x11], #64 + st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x11], #64 + st1 {v24.8h, v25.8h, v26.8h, v27.8h}, [x11], #64 + st1 {v28.8h, v29.8h, v30.8h, v31.8h}, [x11], #64 +1: + clz x2, x9 + add x15, x15, x2, lsl #1 + lsl x9, x9, x2 + ldrh w11, [x15, #-126] +2: + cmp x2, #0x10 + b.lt 3f + sub x2, x2, #0x10 + checkbuf47 + put_bits x13, x14 + b 2b +3: + ldrh w3, [x15, #2]! + add x2, x11, x2, lsl #4 + lsl x9, x9, #0x1 + ldr w12, [x5, x2, lsl #2] + ldrb w10, [x4, x2] + checkbuf31 + put_bits x12, x10 + put_bits x3, x11 + cbnz x9, 1b +6: + add x13, sp, #0x10e + cmp x15, x13 + b.hs 1f + ldr w12, [x5] + ldrb w14, [x4] + checkbuf47 + put_bits x12, x14 +1: + str PUT_BUFFER, [x0, #0x10] + str PUT_BITSw, [x0, #0x18] + ldp x19, x20, [sp], 16 + add x0, BUFFER, #0x1 + add sp, sp, 256 + br x30 + +.endm + +generate_jsimd_huff_encode_one_block 1 +generate_jsimd_huff_encode_one_block 0 + + .unreq BUFFER + .unreq PUT_BUFFER + .unreq PUT_BITS + .unreq PUT_BITSw + +.purgem emit_byte +.purgem put_bits +.purgem checkbuf31 +.purgem checkbuf47 diff --git a/media/libjpeg/simd/jsimd_arm_neon.S b/media/libjpeg/simd/jsimd_arm_neon.S new file mode 100644 index 000000000..cd2612724 --- /dev/null +++ b/media/libjpeg/simd/jsimd_arm_neon.S @@ -0,0 +1,2878 @@ +/* + * ARMv7 NEON optimizations for libjpeg-turbo + * + * Copyright (C) 2009-2011, Nokia Corporation and/or its subsidiary(-ies). + * All Rights Reserved. + * Author: Siarhei Siamashka <siarhei.siamashka@nokia.com> + * Copyright (C) 2014, Siarhei Siamashka. All Rights Reserved. + * Copyright (C) 2014, Linaro Limited. All Rights Reserved. + * Copyright (C) 2015, D. R. Commander. All Rights Reserved. + * Copyright (C) 2015-2016, Matthieu Darbois. All Rights Reserved. + * + * This software is provided 'as-is', without any express or implied + * warranty. In no event will the authors be held liable for any damages + * arising from the use of this software. + * + * Permission is granted to anyone to use this software for any purpose, + * including commercial applications, and to alter it and redistribute it + * freely, subject to the following restrictions: + * + * 1. The origin of this software must not be misrepresented; you must not + * claim that you wrote the original software. If you use this software + * in a product, an acknowledgment in the product documentation would be + * appreciated but is not required. + * 2. Altered source versions must be plainly marked as such, and must not be + * misrepresented as being the original software. + * 3. This notice may not be removed or altered from any source distribution. + */ + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack, "", %progbits /* mark stack as non-executable */ +#endif + +.text +.fpu neon +.arch armv7a +.object_arch armv4 +.arm +.syntax unified + + +#define RESPECT_STRICT_ALIGNMENT 1 + + +/*****************************************************************************/ + +/* Supplementary macro for setting function attributes */ +.macro asm_function fname +#ifdef __APPLE__ + .globl _\fname +_\fname: +#else + .global \fname +#ifdef __ELF__ + .hidden \fname + .type \fname, %function +#endif +\fname: +#endif +.endm + +/* Transpose a block of 4x4 coefficients in four 64-bit registers */ +.macro transpose_4x4 x0, x1, x2, x3 + vtrn.16 \x0, \x1 + vtrn.16 \x2, \x3 + vtrn.32 \x0, \x2 + vtrn.32 \x1, \x3 +.endm + + +#define CENTERJSAMPLE 128 + +/*****************************************************************************/ + +/* + * Perform dequantization and inverse DCT on one block of coefficients. + * + * GLOBAL(void) + * jsimd_idct_islow_neon (void *dct_table, JCOEFPTR coef_block, + * JSAMPARRAY output_buf, JDIMENSION output_col) + */ + +#define FIX_0_298631336 (2446) +#define FIX_0_390180644 (3196) +#define FIX_0_541196100 (4433) +#define FIX_0_765366865 (6270) +#define FIX_0_899976223 (7373) +#define FIX_1_175875602 (9633) +#define FIX_1_501321110 (12299) +#define FIX_1_847759065 (15137) +#define FIX_1_961570560 (16069) +#define FIX_2_053119869 (16819) +#define FIX_2_562915447 (20995) +#define FIX_3_072711026 (25172) + +#define FIX_1_175875602_MINUS_1_961570560 (FIX_1_175875602 - FIX_1_961570560) +#define FIX_1_175875602_MINUS_0_390180644 (FIX_1_175875602 - FIX_0_390180644) +#define FIX_0_541196100_MINUS_1_847759065 (FIX_0_541196100 - FIX_1_847759065) +#define FIX_3_072711026_MINUS_2_562915447 (FIX_3_072711026 - FIX_2_562915447) +#define FIX_0_298631336_MINUS_0_899976223 (FIX_0_298631336 - FIX_0_899976223) +#define FIX_1_501321110_MINUS_0_899976223 (FIX_1_501321110 - FIX_0_899976223) +#define FIX_2_053119869_MINUS_2_562915447 (FIX_2_053119869 - FIX_2_562915447) +#define FIX_0_541196100_PLUS_0_765366865 (FIX_0_541196100 + FIX_0_765366865) + +/* + * Reference SIMD-friendly 1-D ISLOW iDCT C implementation. + * Uses some ideas from the comments in 'simd/jiss2int-64.asm' + */ +#define REF_1D_IDCT(xrow0, xrow1, xrow2, xrow3, xrow4, xrow5, xrow6, xrow7) \ +{ \ + DCTELEM row0, row1, row2, row3, row4, row5, row6, row7; \ + JLONG q1, q2, q3, q4, q5, q6, q7; \ + JLONG tmp11_plus_tmp2, tmp11_minus_tmp2; \ + \ + /* 1-D iDCT input data */ \ + row0 = xrow0; \ + row1 = xrow1; \ + row2 = xrow2; \ + row3 = xrow3; \ + row4 = xrow4; \ + row5 = xrow5; \ + row6 = xrow6; \ + row7 = xrow7; \ + \ + q5 = row7 + row3; \ + q4 = row5 + row1; \ + q6 = MULTIPLY(q5, FIX_1_175875602_MINUS_1_961570560) + \ + MULTIPLY(q4, FIX_1_175875602); \ + q7 = MULTIPLY(q5, FIX_1_175875602) + \ + MULTIPLY(q4, FIX_1_175875602_MINUS_0_390180644); \ + q2 = MULTIPLY(row2, FIX_0_541196100) + \ + MULTIPLY(row6, FIX_0_541196100_MINUS_1_847759065); \ + q4 = q6; \ + q3 = ((JLONG) row0 - (JLONG) row4) << 13; \ + q6 += MULTIPLY(row5, -FIX_2_562915447) + \ + MULTIPLY(row3, FIX_3_072711026_MINUS_2_562915447); \ + /* now we can use q1 (reloadable constants have been used up) */ \ + q1 = q3 + q2; \ + q4 += MULTIPLY(row7, FIX_0_298631336_MINUS_0_899976223) + \ + MULTIPLY(row1, -FIX_0_899976223); \ + q5 = q7; \ + q1 = q1 + q6; \ + q7 += MULTIPLY(row7, -FIX_0_899976223) + \ + MULTIPLY(row1, FIX_1_501321110_MINUS_0_899976223); \ + \ + /* (tmp11 + tmp2) has been calculated (out_row1 before descale) */ \ + tmp11_plus_tmp2 = q1; \ + row1 = 0; \ + \ + q1 = q1 - q6; \ + q5 += MULTIPLY(row5, FIX_2_053119869_MINUS_2_562915447) + \ + MULTIPLY(row3, -FIX_2_562915447); \ + q1 = q1 - q6; \ + q6 = MULTIPLY(row2, FIX_0_541196100_PLUS_0_765366865) + \ + MULTIPLY(row6, FIX_0_541196100); \ + q3 = q3 - q2; \ + \ + /* (tmp11 - tmp2) has been calculated (out_row6 before descale) */ \ + tmp11_minus_tmp2 = q1; \ + \ + q1 = ((JLONG) row0 + (JLONG) row4) << 13; \ + q2 = q1 + q6; \ + q1 = q1 - q6; \ + \ + /* pick up the results */ \ + tmp0 = q4; \ + tmp1 = q5; \ + tmp2 = (tmp11_plus_tmp2 - tmp11_minus_tmp2) / 2; \ + tmp3 = q7; \ + tmp10 = q2; \ + tmp11 = (tmp11_plus_tmp2 + tmp11_minus_tmp2) / 2; \ + tmp12 = q3; \ + tmp13 = q1; \ +} + +#define XFIX_0_899976223 d0[0] +#define XFIX_0_541196100 d0[1] +#define XFIX_2_562915447 d0[2] +#define XFIX_0_298631336_MINUS_0_899976223 d0[3] +#define XFIX_1_501321110_MINUS_0_899976223 d1[0] +#define XFIX_2_053119869_MINUS_2_562915447 d1[1] +#define XFIX_0_541196100_PLUS_0_765366865 d1[2] +#define XFIX_1_175875602 d1[3] +#define XFIX_1_175875602_MINUS_0_390180644 d2[0] +#define XFIX_0_541196100_MINUS_1_847759065 d2[1] +#define XFIX_3_072711026_MINUS_2_562915447 d2[2] +#define XFIX_1_175875602_MINUS_1_961570560 d2[3] + +.balign 16 +jsimd_idct_islow_neon_consts: + .short FIX_0_899976223 /* d0[0] */ + .short FIX_0_541196100 /* d0[1] */ + .short FIX_2_562915447 /* d0[2] */ + .short FIX_0_298631336_MINUS_0_899976223 /* d0[3] */ + .short FIX_1_501321110_MINUS_0_899976223 /* d1[0] */ + .short FIX_2_053119869_MINUS_2_562915447 /* d1[1] */ + .short FIX_0_541196100_PLUS_0_765366865 /* d1[2] */ + .short FIX_1_175875602 /* d1[3] */ + /* reloadable constants */ + .short FIX_1_175875602_MINUS_0_390180644 /* d2[0] */ + .short FIX_0_541196100_MINUS_1_847759065 /* d2[1] */ + .short FIX_3_072711026_MINUS_2_562915447 /* d2[2] */ + .short FIX_1_175875602_MINUS_1_961570560 /* d2[3] */ + +asm_function jsimd_idct_islow_neon + + DCT_TABLE .req r0 + COEF_BLOCK .req r1 + OUTPUT_BUF .req r2 + OUTPUT_COL .req r3 + TMP1 .req r0 + TMP2 .req r1 + TMP3 .req r2 + TMP4 .req ip + + ROW0L .req d16 + ROW0R .req d17 + ROW1L .req d18 + ROW1R .req d19 + ROW2L .req d20 + ROW2R .req d21 + ROW3L .req d22 + ROW3R .req d23 + ROW4L .req d24 + ROW4R .req d25 + ROW5L .req d26 + ROW5R .req d27 + ROW6L .req d28 + ROW6R .req d29 + ROW7L .req d30 + ROW7R .req d31 + + /* Load and dequantize coefficients into NEON registers + * with the following allocation: + * 0 1 2 3 | 4 5 6 7 + * ---------+-------- + * 0 | d16 | d17 ( q8 ) + * 1 | d18 | d19 ( q9 ) + * 2 | d20 | d21 ( q10 ) + * 3 | d22 | d23 ( q11 ) + * 4 | d24 | d25 ( q12 ) + * 5 | d26 | d27 ( q13 ) + * 6 | d28 | d29 ( q14 ) + * 7 | d30 | d31 ( q15 ) + */ + adr ip, jsimd_idct_islow_neon_consts + vld1.16 {d16, d17, d18, d19}, [COEF_BLOCK, :128]! + vld1.16 {d0, d1, d2, d3}, [DCT_TABLE, :128]! + vld1.16 {d20, d21, d22, d23}, [COEF_BLOCK, :128]! + vmul.s16 q8, q8, q0 + vld1.16 {d4, d5, d6, d7}, [DCT_TABLE, :128]! + vmul.s16 q9, q9, q1 + vld1.16 {d24, d25, d26, d27}, [COEF_BLOCK, :128]! + vmul.s16 q10, q10, q2 + vld1.16 {d0, d1, d2, d3}, [DCT_TABLE, :128]! + vmul.s16 q11, q11, q3 + vld1.16 {d28, d29, d30, d31}, [COEF_BLOCK, :128] + vmul.s16 q12, q12, q0 + vld1.16 {d4, d5, d6, d7}, [DCT_TABLE, :128]! + vmul.s16 q14, q14, q2 + vmul.s16 q13, q13, q1 + vld1.16 {d0, d1, d2, d3}, [ip, :128] /* load constants */ + add ip, ip, #16 + vmul.s16 q15, q15, q3 + vpush {d8-d15} /* save NEON registers */ + /* 1-D IDCT, pass 1, left 4x8 half */ + vadd.s16 d4, ROW7L, ROW3L + vadd.s16 d5, ROW5L, ROW1L + vmull.s16 q6, d4, XFIX_1_175875602_MINUS_1_961570560 + vmlal.s16 q6, d5, XFIX_1_175875602 + vmull.s16 q7, d4, XFIX_1_175875602 + /* Check for the zero coefficients in the right 4x8 half */ + push {r4, r5} + vmlal.s16 q7, d5, XFIX_1_175875602_MINUS_0_390180644 + vsubl.s16 q3, ROW0L, ROW4L + ldrd r4, [COEF_BLOCK, #(-96 + 2 * (4 + 1 * 8))] + vmull.s16 q2, ROW2L, XFIX_0_541196100 + vmlal.s16 q2, ROW6L, XFIX_0_541196100_MINUS_1_847759065 + orr r0, r4, r5 + vmov q4, q6 + vmlsl.s16 q6, ROW5L, XFIX_2_562915447 + ldrd r4, [COEF_BLOCK, #(-96 + 2 * (4 + 2 * 8))] + vmlal.s16 q6, ROW3L, XFIX_3_072711026_MINUS_2_562915447 + vshl.s32 q3, q3, #13 + orr r0, r0, r4 + vmlsl.s16 q4, ROW1L, XFIX_0_899976223 + orr r0, r0, r5 + vadd.s32 q1, q3, q2 + ldrd r4, [COEF_BLOCK, #(-96 + 2 * (4 + 3 * 8))] + vmov q5, q7 + vadd.s32 q1, q1, q6 + orr r0, r0, r4 + vmlsl.s16 q7, ROW7L, XFIX_0_899976223 + orr r0, r0, r5 + vmlal.s16 q7, ROW1L, XFIX_1_501321110_MINUS_0_899976223 + vrshrn.s32 ROW1L, q1, #11 + ldrd r4, [COEF_BLOCK, #(-96 + 2 * (4 + 4 * 8))] + vsub.s32 q1, q1, q6 + vmlal.s16 q5, ROW5L, XFIX_2_053119869_MINUS_2_562915447 + orr r0, r0, r4 + vmlsl.s16 q5, ROW3L, XFIX_2_562915447 + orr r0, r0, r5 + vsub.s32 q1, q1, q6 + vmull.s16 q6, ROW2L, XFIX_0_541196100_PLUS_0_765366865 + ldrd r4, [COEF_BLOCK, #(-96 + 2 * (4 + 5 * 8))] + vmlal.s16 q6, ROW6L, XFIX_0_541196100 + vsub.s32 q3, q3, q2 + orr r0, r0, r4 + vrshrn.s32 ROW6L, q1, #11 + orr r0, r0, r5 + vadd.s32 q1, q3, q5 + ldrd r4, [COEF_BLOCK, #(-96 + 2 * (4 + 6 * 8))] + vsub.s32 q3, q3, q5 + vaddl.s16 q5, ROW0L, ROW4L + orr r0, r0, r4 + vrshrn.s32 ROW2L, q1, #11 + orr r0, r0, r5 + vrshrn.s32 ROW5L, q3, #11 + ldrd r4, [COEF_BLOCK, #(-96 + 2 * (4 + 7 * 8))] + vshl.s32 q5, q5, #13 + vmlal.s16 q4, ROW7L, XFIX_0_298631336_MINUS_0_899976223 + orr r0, r0, r4 + vadd.s32 q2, q5, q6 + orrs r0, r0, r5 + vsub.s32 q1, q5, q6 + vadd.s32 q6, q2, q7 + ldrd r4, [COEF_BLOCK, #(-96 + 2 * (4 + 0 * 8))] + vsub.s32 q2, q2, q7 + vadd.s32 q5, q1, q4 + orr r0, r4, r5 + vsub.s32 q3, q1, q4 + pop {r4, r5} + vrshrn.s32 ROW7L, q2, #11 + vrshrn.s32 ROW3L, q5, #11 + vrshrn.s32 ROW0L, q6, #11 + vrshrn.s32 ROW4L, q3, #11 + + beq 3f /* Go to do some special handling for the sparse + right 4x8 half */ + + /* 1-D IDCT, pass 1, right 4x8 half */ + vld1.s16 {d2}, [ip, :64] /* reload constants */ + vadd.s16 d10, ROW7R, ROW3R + vadd.s16 d8, ROW5R, ROW1R + /* Transpose left 4x8 half */ + vtrn.16 ROW6L, ROW7L + vmull.s16 q6, d10, XFIX_1_175875602_MINUS_1_961570560 + vmlal.s16 q6, d8, XFIX_1_175875602 + vtrn.16 ROW2L, ROW3L + vmull.s16 q7, d10, XFIX_1_175875602 + vmlal.s16 q7, d8, XFIX_1_175875602_MINUS_0_390180644 + vtrn.16 ROW0L, ROW1L + vsubl.s16 q3, ROW0R, ROW4R + vmull.s16 q2, ROW2R, XFIX_0_541196100 + vmlal.s16 q2, ROW6R, XFIX_0_541196100_MINUS_1_847759065 + vtrn.16 ROW4L, ROW5L + vmov q4, q6 + vmlsl.s16 q6, ROW5R, XFIX_2_562915447 + vmlal.s16 q6, ROW3R, XFIX_3_072711026_MINUS_2_562915447 + vtrn.32 ROW1L, ROW3L + vshl.s32 q3, q3, #13 + vmlsl.s16 q4, ROW1R, XFIX_0_899976223 + vtrn.32 ROW4L, ROW6L + vadd.s32 q1, q3, q2 + vmov q5, q7 + vadd.s32 q1, q1, q6 + vtrn.32 ROW0L, ROW2L + vmlsl.s16 q7, ROW7R, XFIX_0_899976223 + vmlal.s16 q7, ROW1R, XFIX_1_501321110_MINUS_0_899976223 + vrshrn.s32 ROW1R, q1, #11 + vtrn.32 ROW5L, ROW7L + vsub.s32 q1, q1, q6 + vmlal.s16 q5, ROW5R, XFIX_2_053119869_MINUS_2_562915447 + vmlsl.s16 q5, ROW3R, XFIX_2_562915447 + vsub.s32 q1, q1, q6 + vmull.s16 q6, ROW2R, XFIX_0_541196100_PLUS_0_765366865 + vmlal.s16 q6, ROW6R, XFIX_0_541196100 + vsub.s32 q3, q3, q2 + vrshrn.s32 ROW6R, q1, #11 + vadd.s32 q1, q3, q5 + vsub.s32 q3, q3, q5 + vaddl.s16 q5, ROW0R, ROW4R + vrshrn.s32 ROW2R, q1, #11 + vrshrn.s32 ROW5R, q3, #11 + vshl.s32 q5, q5, #13 + vmlal.s16 q4, ROW7R, XFIX_0_298631336_MINUS_0_899976223 + vadd.s32 q2, q5, q6 + vsub.s32 q1, q5, q6 + vadd.s32 q6, q2, q7 + vsub.s32 q2, q2, q7 + vadd.s32 q5, q1, q4 + vsub.s32 q3, q1, q4 + vrshrn.s32 ROW7R, q2, #11 + vrshrn.s32 ROW3R, q5, #11 + vrshrn.s32 ROW0R, q6, #11 + vrshrn.s32 ROW4R, q3, #11 + /* Transpose right 4x8 half */ + vtrn.16 ROW6R, ROW7R + vtrn.16 ROW2R, ROW3R + vtrn.16 ROW0R, ROW1R + vtrn.16 ROW4R, ROW5R + vtrn.32 ROW1R, ROW3R + vtrn.32 ROW4R, ROW6R + vtrn.32 ROW0R, ROW2R + vtrn.32 ROW5R, ROW7R + +1: /* 1-D IDCT, pass 2 (normal variant), left 4x8 half */ + vld1.s16 {d2}, [ip, :64] /* reload constants */ + vmull.s16 q6, ROW1R, XFIX_1_175875602 /* ROW5L <-> ROW1R */ + vmlal.s16 q6, ROW1L, XFIX_1_175875602 + vmlal.s16 q6, ROW3R, XFIX_1_175875602_MINUS_1_961570560 /* ROW7L <-> ROW3R */ + vmlal.s16 q6, ROW3L, XFIX_1_175875602_MINUS_1_961570560 + vmull.s16 q7, ROW3R, XFIX_1_175875602 /* ROW7L <-> ROW3R */ + vmlal.s16 q7, ROW3L, XFIX_1_175875602 + vmlal.s16 q7, ROW1R, XFIX_1_175875602_MINUS_0_390180644 /* ROW5L <-> ROW1R */ + vmlal.s16 q7, ROW1L, XFIX_1_175875602_MINUS_0_390180644 + vsubl.s16 q3, ROW0L, ROW0R /* ROW4L <-> ROW0R */ + vmull.s16 q2, ROW2L, XFIX_0_541196100 + vmlal.s16 q2, ROW2R, XFIX_0_541196100_MINUS_1_847759065 /* ROW6L <-> ROW2R */ + vmov q4, q6 + vmlsl.s16 q6, ROW1R, XFIX_2_562915447 /* ROW5L <-> ROW1R */ + vmlal.s16 q6, ROW3L, XFIX_3_072711026_MINUS_2_562915447 + vshl.s32 q3, q3, #13 + vmlsl.s16 q4, ROW1L, XFIX_0_899976223 + vadd.s32 q1, q3, q2 + vmov q5, q7 + vadd.s32 q1, q1, q6 + vmlsl.s16 q7, ROW3R, XFIX_0_899976223 /* ROW7L <-> ROW3R */ + vmlal.s16 q7, ROW1L, XFIX_1_501321110_MINUS_0_899976223 + vshrn.s32 ROW1L, q1, #16 + vsub.s32 q1, q1, q6 + vmlal.s16 q5, ROW1R, XFIX_2_053119869_MINUS_2_562915447 /* ROW5L <-> ROW1R */ + vmlsl.s16 q5, ROW3L, XFIX_2_562915447 + vsub.s32 q1, q1, q6 + vmull.s16 q6, ROW2L, XFIX_0_541196100_PLUS_0_765366865 + vmlal.s16 q6, ROW2R, XFIX_0_541196100 /* ROW6L <-> ROW2R */ + vsub.s32 q3, q3, q2 + vshrn.s32 ROW2R, q1, #16 /* ROW6L <-> ROW2R */ + vadd.s32 q1, q3, q5 + vsub.s32 q3, q3, q5 + vaddl.s16 q5, ROW0L, ROW0R /* ROW4L <-> ROW0R */ + vshrn.s32 ROW2L, q1, #16 + vshrn.s32 ROW1R, q3, #16 /* ROW5L <-> ROW1R */ + vshl.s32 q5, q5, #13 + vmlal.s16 q4, ROW3R, XFIX_0_298631336_MINUS_0_899976223 /* ROW7L <-> ROW3R */ + vadd.s32 q2, q5, q6 + vsub.s32 q1, q5, q6 + vadd.s32 q6, q2, q7 + vsub.s32 q2, q2, q7 + vadd.s32 q5, q1, q4 + vsub.s32 q3, q1, q4 + vshrn.s32 ROW3R, q2, #16 /* ROW7L <-> ROW3R */ + vshrn.s32 ROW3L, q5, #16 + vshrn.s32 ROW0L, q6, #16 + vshrn.s32 ROW0R, q3, #16 /* ROW4L <-> ROW0R */ + /* 1-D IDCT, pass 2, right 4x8 half */ + vld1.s16 {d2}, [ip, :64] /* reload constants */ + vmull.s16 q6, ROW5R, XFIX_1_175875602 + vmlal.s16 q6, ROW5L, XFIX_1_175875602 /* ROW5L <-> ROW1R */ + vmlal.s16 q6, ROW7R, XFIX_1_175875602_MINUS_1_961570560 + vmlal.s16 q6, ROW7L, XFIX_1_175875602_MINUS_1_961570560 /* ROW7L <-> ROW3R */ + vmull.s16 q7, ROW7R, XFIX_1_175875602 + vmlal.s16 q7, ROW7L, XFIX_1_175875602 /* ROW7L <-> ROW3R */ + vmlal.s16 q7, ROW5R, XFIX_1_175875602_MINUS_0_390180644 + vmlal.s16 q7, ROW5L, XFIX_1_175875602_MINUS_0_390180644 /* ROW5L <-> ROW1R */ + vsubl.s16 q3, ROW4L, ROW4R /* ROW4L <-> ROW0R */ + vmull.s16 q2, ROW6L, XFIX_0_541196100 /* ROW6L <-> ROW2R */ + vmlal.s16 q2, ROW6R, XFIX_0_541196100_MINUS_1_847759065 + vmov q4, q6 + vmlsl.s16 q6, ROW5R, XFIX_2_562915447 + vmlal.s16 q6, ROW7L, XFIX_3_072711026_MINUS_2_562915447 /* ROW7L <-> ROW3R */ + vshl.s32 q3, q3, #13 + vmlsl.s16 q4, ROW5L, XFIX_0_899976223 /* ROW5L <-> ROW1R */ + vadd.s32 q1, q3, q2 + vmov q5, q7 + vadd.s32 q1, q1, q6 + vmlsl.s16 q7, ROW7R, XFIX_0_899976223 + vmlal.s16 q7, ROW5L, XFIX_1_501321110_MINUS_0_899976223 /* ROW5L <-> ROW1R */ + vshrn.s32 ROW5L, q1, #16 /* ROW5L <-> ROW1R */ + vsub.s32 q1, q1, q6 + vmlal.s16 q5, ROW5R, XFIX_2_053119869_MINUS_2_562915447 + vmlsl.s16 q5, ROW7L, XFIX_2_562915447 /* ROW7L <-> ROW3R */ + vsub.s32 q1, q1, q6 + vmull.s16 q6, ROW6L, XFIX_0_541196100_PLUS_0_765366865 /* ROW6L <-> ROW2R */ + vmlal.s16 q6, ROW6R, XFIX_0_541196100 + vsub.s32 q3, q3, q2 + vshrn.s32 ROW6R, q1, #16 + vadd.s32 q1, q3, q5 + vsub.s32 q3, q3, q5 + vaddl.s16 q5, ROW4L, ROW4R /* ROW4L <-> ROW0R */ + vshrn.s32 ROW6L, q1, #16 /* ROW6L <-> ROW2R */ + vshrn.s32 ROW5R, q3, #16 + vshl.s32 q5, q5, #13 + vmlal.s16 q4, ROW7R, XFIX_0_298631336_MINUS_0_899976223 + vadd.s32 q2, q5, q6 + vsub.s32 q1, q5, q6 + vadd.s32 q6, q2, q7 + vsub.s32 q2, q2, q7 + vadd.s32 q5, q1, q4 + vsub.s32 q3, q1, q4 + vshrn.s32 ROW7R, q2, #16 + vshrn.s32 ROW7L, q5, #16 /* ROW7L <-> ROW3R */ + vshrn.s32 ROW4L, q6, #16 /* ROW4L <-> ROW0R */ + vshrn.s32 ROW4R, q3, #16 + +2: /* Descale to 8-bit and range limit */ + vqrshrn.s16 d16, q8, #2 + vqrshrn.s16 d17, q9, #2 + vqrshrn.s16 d18, q10, #2 + vqrshrn.s16 d19, q11, #2 + vpop {d8-d15} /* restore NEON registers */ + vqrshrn.s16 d20, q12, #2 + /* Transpose the final 8-bit samples and do signed->unsigned conversion */ + vtrn.16 q8, q9 + vqrshrn.s16 d21, q13, #2 + vqrshrn.s16 d22, q14, #2 + vmov.u8 q0, #(CENTERJSAMPLE) + vqrshrn.s16 d23, q15, #2 + vtrn.8 d16, d17 + vtrn.8 d18, d19 + vadd.u8 q8, q8, q0 + vadd.u8 q9, q9, q0 + vtrn.16 q10, q11 + /* Store results to the output buffer */ + ldmia OUTPUT_BUF!, {TMP1, TMP2} + add TMP1, TMP1, OUTPUT_COL + add TMP2, TMP2, OUTPUT_COL + vst1.8 {d16}, [TMP1] + vtrn.8 d20, d21 + vst1.8 {d17}, [TMP2] + ldmia OUTPUT_BUF!, {TMP1, TMP2} + add TMP1, TMP1, OUTPUT_COL + add TMP2, TMP2, OUTPUT_COL + vst1.8 {d18}, [TMP1] + vadd.u8 q10, q10, q0 + vst1.8 {d19}, [TMP2] + ldmia OUTPUT_BUF, {TMP1, TMP2, TMP3, TMP4} + add TMP1, TMP1, OUTPUT_COL + add TMP2, TMP2, OUTPUT_COL + add TMP3, TMP3, OUTPUT_COL + add TMP4, TMP4, OUTPUT_COL + vtrn.8 d22, d23 + vst1.8 {d20}, [TMP1] + vadd.u8 q11, q11, q0 + vst1.8 {d21}, [TMP2] + vst1.8 {d22}, [TMP3] + vst1.8 {d23}, [TMP4] + bx lr + +3: /* Left 4x8 half is done, right 4x8 half contains mostly zeros */ + + /* Transpose left 4x8 half */ + vtrn.16 ROW6L, ROW7L + vtrn.16 ROW2L, ROW3L + vtrn.16 ROW0L, ROW1L + vtrn.16 ROW4L, ROW5L + vshl.s16 ROW0R, ROW0R, #2 /* PASS1_BITS */ + vtrn.32 ROW1L, ROW3L + vtrn.32 ROW4L, ROW6L + vtrn.32 ROW0L, ROW2L + vtrn.32 ROW5L, ROW7L + + cmp r0, #0 + beq 4f /* Right 4x8 half has all zeros, go to 'sparse' second + pass */ + + /* Only row 0 is non-zero for the right 4x8 half */ + vdup.s16 ROW1R, ROW0R[1] + vdup.s16 ROW2R, ROW0R[2] + vdup.s16 ROW3R, ROW0R[3] + vdup.s16 ROW4R, ROW0R[0] + vdup.s16 ROW5R, ROW0R[1] + vdup.s16 ROW6R, ROW0R[2] + vdup.s16 ROW7R, ROW0R[3] + vdup.s16 ROW0R, ROW0R[0] + b 1b /* Go to 'normal' second pass */ + +4: /* 1-D IDCT, pass 2 (sparse variant with zero rows 4-7), left 4x8 half */ + vld1.s16 {d2}, [ip, :64] /* reload constants */ + vmull.s16 q6, ROW1L, XFIX_1_175875602 + vmlal.s16 q6, ROW3L, XFIX_1_175875602_MINUS_1_961570560 + vmull.s16 q7, ROW3L, XFIX_1_175875602 + vmlal.s16 q7, ROW1L, XFIX_1_175875602_MINUS_0_390180644 + vmull.s16 q2, ROW2L, XFIX_0_541196100 + vshll.s16 q3, ROW0L, #13 + vmov q4, q6 + vmlal.s16 q6, ROW3L, XFIX_3_072711026_MINUS_2_562915447 + vmlsl.s16 q4, ROW1L, XFIX_0_899976223 + vadd.s32 q1, q3, q2 + vmov q5, q7 + vmlal.s16 q7, ROW1L, XFIX_1_501321110_MINUS_0_899976223 + vadd.s32 q1, q1, q6 + vadd.s32 q6, q6, q6 + vmlsl.s16 q5, ROW3L, XFIX_2_562915447 + vshrn.s32 ROW1L, q1, #16 + vsub.s32 q1, q1, q6 + vmull.s16 q6, ROW2L, XFIX_0_541196100_PLUS_0_765366865 + vsub.s32 q3, q3, q2 + vshrn.s32 ROW2R, q1, #16 /* ROW6L <-> ROW2R */ + vadd.s32 q1, q3, q5 + vsub.s32 q3, q3, q5 + vshll.s16 q5, ROW0L, #13 + vshrn.s32 ROW2L, q1, #16 + vshrn.s32 ROW1R, q3, #16 /* ROW5L <-> ROW1R */ + vadd.s32 q2, q5, q6 + vsub.s32 q1, q5, q6 + vadd.s32 q6, q2, q7 + vsub.s32 q2, q2, q7 + vadd.s32 q5, q1, q4 + vsub.s32 q3, q1, q4 + vshrn.s32 ROW3R, q2, #16 /* ROW7L <-> ROW3R */ + vshrn.s32 ROW3L, q5, #16 + vshrn.s32 ROW0L, q6, #16 + vshrn.s32 ROW0R, q3, #16 /* ROW4L <-> ROW0R */ + /* 1-D IDCT, pass 2 (sparse variant with zero rows 4-7), right 4x8 half */ + vld1.s16 {d2}, [ip, :64] /* reload constants */ + vmull.s16 q6, ROW5L, XFIX_1_175875602 + vmlal.s16 q6, ROW7L, XFIX_1_175875602_MINUS_1_961570560 + vmull.s16 q7, ROW7L, XFIX_1_175875602 + vmlal.s16 q7, ROW5L, XFIX_1_175875602_MINUS_0_390180644 + vmull.s16 q2, ROW6L, XFIX_0_541196100 + vshll.s16 q3, ROW4L, #13 + vmov q4, q6 + vmlal.s16 q6, ROW7L, XFIX_3_072711026_MINUS_2_562915447 + vmlsl.s16 q4, ROW5L, XFIX_0_899976223 + vadd.s32 q1, q3, q2 + vmov q5, q7 + vmlal.s16 q7, ROW5L, XFIX_1_501321110_MINUS_0_899976223 + vadd.s32 q1, q1, q6 + vadd.s32 q6, q6, q6 + vmlsl.s16 q5, ROW7L, XFIX_2_562915447 + vshrn.s32 ROW5L, q1, #16 /* ROW5L <-> ROW1R */ + vsub.s32 q1, q1, q6 + vmull.s16 q6, ROW6L, XFIX_0_541196100_PLUS_0_765366865 + vsub.s32 q3, q3, q2 + vshrn.s32 ROW6R, q1, #16 + vadd.s32 q1, q3, q5 + vsub.s32 q3, q3, q5 + vshll.s16 q5, ROW4L, #13 + vshrn.s32 ROW6L, q1, #16 /* ROW6L <-> ROW2R */ + vshrn.s32 ROW5R, q3, #16 + vadd.s32 q2, q5, q6 + vsub.s32 q1, q5, q6 + vadd.s32 q6, q2, q7 + vsub.s32 q2, q2, q7 + vadd.s32 q5, q1, q4 + vsub.s32 q3, q1, q4 + vshrn.s32 ROW7R, q2, #16 + vshrn.s32 ROW7L, q5, #16 /* ROW7L <-> ROW3R */ + vshrn.s32 ROW4L, q6, #16 /* ROW4L <-> ROW0R */ + vshrn.s32 ROW4R, q3, #16 + b 2b /* Go to epilogue */ + + .unreq DCT_TABLE + .unreq COEF_BLOCK + .unreq OUTPUT_BUF + .unreq OUTPUT_COL + .unreq TMP1 + .unreq TMP2 + .unreq TMP3 + .unreq TMP4 + + .unreq ROW0L + .unreq ROW0R + .unreq ROW1L + .unreq ROW1R + .unreq ROW2L + .unreq ROW2R + .unreq ROW3L + .unreq ROW3R + .unreq ROW4L + .unreq ROW4R + .unreq ROW5L + .unreq ROW5R + .unreq ROW6L + .unreq ROW6R + .unreq ROW7L + .unreq ROW7R + + +/*****************************************************************************/ + +/* + * jsimd_idct_ifast_neon + * + * This function contains a fast, not so accurate integer implementation of + * the inverse DCT (Discrete Cosine Transform). It uses the same calculations + * and produces exactly the same output as IJG's original 'jpeg_idct_ifast' + * function from jidctfst.c + * + * Normally 1-D AAN DCT needs 5 multiplications and 29 additions. + * But in ARM NEON case some extra additions are required because VQDMULH + * instruction can't handle the constants larger than 1. So the expressions + * like "x * 1.082392200" have to be converted to "x * 0.082392200 + x", + * which introduces an extra addition. Overall, there are 6 extra additions + * per 1-D IDCT pass, totalling to 5 VQDMULH and 35 VADD/VSUB instructions. + */ + +#define XFIX_1_082392200 d0[0] +#define XFIX_1_414213562 d0[1] +#define XFIX_1_847759065 d0[2] +#define XFIX_2_613125930 d0[3] + +.balign 16 +jsimd_idct_ifast_neon_consts: + .short (277 * 128 - 256 * 128) /* XFIX_1_082392200 */ + .short (362 * 128 - 256 * 128) /* XFIX_1_414213562 */ + .short (473 * 128 - 256 * 128) /* XFIX_1_847759065 */ + .short (669 * 128 - 512 * 128) /* XFIX_2_613125930 */ + +asm_function jsimd_idct_ifast_neon + + DCT_TABLE .req r0 + COEF_BLOCK .req r1 + OUTPUT_BUF .req r2 + OUTPUT_COL .req r3 + TMP1 .req r0 + TMP2 .req r1 + TMP3 .req r2 + TMP4 .req ip + + /* Load and dequantize coefficients into NEON registers + * with the following allocation: + * 0 1 2 3 | 4 5 6 7 + * ---------+-------- + * 0 | d16 | d17 ( q8 ) + * 1 | d18 | d19 ( q9 ) + * 2 | d20 | d21 ( q10 ) + * 3 | d22 | d23 ( q11 ) + * 4 | d24 | d25 ( q12 ) + * 5 | d26 | d27 ( q13 ) + * 6 | d28 | d29 ( q14 ) + * 7 | d30 | d31 ( q15 ) + */ + adr ip, jsimd_idct_ifast_neon_consts + vld1.16 {d16, d17, d18, d19}, [COEF_BLOCK, :128]! + vld1.16 {d0, d1, d2, d3}, [DCT_TABLE, :128]! + vld1.16 {d20, d21, d22, d23}, [COEF_BLOCK, :128]! + vmul.s16 q8, q8, q0 + vld1.16 {d4, d5, d6, d7}, [DCT_TABLE, :128]! + vmul.s16 q9, q9, q1 + vld1.16 {d24, d25, d26, d27}, [COEF_BLOCK, :128]! + vmul.s16 q10, q10, q2 + vld1.16 {d0, d1, d2, d3}, [DCT_TABLE, :128]! + vmul.s16 q11, q11, q3 + vld1.16 {d28, d29, d30, d31}, [COEF_BLOCK, :128] + vmul.s16 q12, q12, q0 + vld1.16 {d4, d5, d6, d7}, [DCT_TABLE, :128]! + vmul.s16 q14, q14, q2 + vmul.s16 q13, q13, q1 + vld1.16 {d0}, [ip, :64] /* load constants */ + vmul.s16 q15, q15, q3 + vpush {d8-d13} /* save NEON registers */ + /* 1-D IDCT, pass 1 */ + vsub.s16 q2, q10, q14 + vadd.s16 q14, q10, q14 + vsub.s16 q1, q11, q13 + vadd.s16 q13, q11, q13 + vsub.s16 q5, q9, q15 + vadd.s16 q15, q9, q15 + vqdmulh.s16 q4, q2, XFIX_1_414213562 + vqdmulh.s16 q6, q1, XFIX_2_613125930 + vadd.s16 q3, q1, q1 + vsub.s16 q1, q5, q1 + vadd.s16 q10, q2, q4 + vqdmulh.s16 q4, q1, XFIX_1_847759065 + vsub.s16 q2, q15, q13 + vadd.s16 q3, q3, q6 + vqdmulh.s16 q6, q2, XFIX_1_414213562 + vadd.s16 q1, q1, q4 + vqdmulh.s16 q4, q5, XFIX_1_082392200 + vsub.s16 q10, q10, q14 + vadd.s16 q2, q2, q6 + vsub.s16 q6, q8, q12 + vadd.s16 q12, q8, q12 + vadd.s16 q9, q5, q4 + vadd.s16 q5, q6, q10 + vsub.s16 q10, q6, q10 + vadd.s16 q6, q15, q13 + vadd.s16 q8, q12, q14 + vsub.s16 q3, q6, q3 + vsub.s16 q12, q12, q14 + vsub.s16 q3, q3, q1 + vsub.s16 q1, q9, q1 + vadd.s16 q2, q3, q2 + vsub.s16 q15, q8, q6 + vadd.s16 q1, q1, q2 + vadd.s16 q8, q8, q6 + vadd.s16 q14, q5, q3 + vsub.s16 q9, q5, q3 + vsub.s16 q13, q10, q2 + vadd.s16 q10, q10, q2 + /* Transpose */ + vtrn.16 q8, q9 + vsub.s16 q11, q12, q1 + vtrn.16 q14, q15 + vadd.s16 q12, q12, q1 + vtrn.16 q10, q11 + vtrn.16 q12, q13 + vtrn.32 q9, q11 + vtrn.32 q12, q14 + vtrn.32 q8, q10 + vtrn.32 q13, q15 + vswp d28, d21 + vswp d26, d19 + /* 1-D IDCT, pass 2 */ + vsub.s16 q2, q10, q14 + vswp d30, d23 + vadd.s16 q14, q10, q14 + vswp d24, d17 + vsub.s16 q1, q11, q13 + vadd.s16 q13, q11, q13 + vsub.s16 q5, q9, q15 + vadd.s16 q15, q9, q15 + vqdmulh.s16 q4, q2, XFIX_1_414213562 + vqdmulh.s16 q6, q1, XFIX_2_613125930 + vadd.s16 q3, q1, q1 + vsub.s16 q1, q5, q1 + vadd.s16 q10, q2, q4 + vqdmulh.s16 q4, q1, XFIX_1_847759065 + vsub.s16 q2, q15, q13 + vadd.s16 q3, q3, q6 + vqdmulh.s16 q6, q2, XFIX_1_414213562 + vadd.s16 q1, q1, q4 + vqdmulh.s16 q4, q5, XFIX_1_082392200 + vsub.s16 q10, q10, q14 + vadd.s16 q2, q2, q6 + vsub.s16 q6, q8, q12 + vadd.s16 q12, q8, q12 + vadd.s16 q9, q5, q4 + vadd.s16 q5, q6, q10 + vsub.s16 q10, q6, q10 + vadd.s16 q6, q15, q13 + vadd.s16 q8, q12, q14 + vsub.s16 q3, q6, q3 + vsub.s16 q12, q12, q14 + vsub.s16 q3, q3, q1 + vsub.s16 q1, q9, q1 + vadd.s16 q2, q3, q2 + vsub.s16 q15, q8, q6 + vadd.s16 q1, q1, q2 + vadd.s16 q8, q8, q6 + vadd.s16 q14, q5, q3 + vsub.s16 q9, q5, q3 + vsub.s16 q13, q10, q2 + vpop {d8-d13} /* restore NEON registers */ + vadd.s16 q10, q10, q2 + vsub.s16 q11, q12, q1 + vadd.s16 q12, q12, q1 + /* Descale to 8-bit and range limit */ + vmov.u8 q0, #0x80 + vqshrn.s16 d16, q8, #5 + vqshrn.s16 d17, q9, #5 + vqshrn.s16 d18, q10, #5 + vqshrn.s16 d19, q11, #5 + vqshrn.s16 d20, q12, #5 + vqshrn.s16 d21, q13, #5 + vqshrn.s16 d22, q14, #5 + vqshrn.s16 d23, q15, #5 + vadd.u8 q8, q8, q0 + vadd.u8 q9, q9, q0 + vadd.u8 q10, q10, q0 + vadd.u8 q11, q11, q0 + /* Transpose the final 8-bit samples */ + vtrn.16 q8, q9 + vtrn.16 q10, q11 + vtrn.32 q8, q10 + vtrn.32 q9, q11 + vtrn.8 d16, d17 + vtrn.8 d18, d19 + /* Store results to the output buffer */ + ldmia OUTPUT_BUF!, {TMP1, TMP2} + add TMP1, TMP1, OUTPUT_COL + add TMP2, TMP2, OUTPUT_COL + vst1.8 {d16}, [TMP1] + vst1.8 {d17}, [TMP2] + ldmia OUTPUT_BUF!, {TMP1, TMP2} + add TMP1, TMP1, OUTPUT_COL + add TMP2, TMP2, OUTPUT_COL + vst1.8 {d18}, [TMP1] + vtrn.8 d20, d21 + vst1.8 {d19}, [TMP2] + ldmia OUTPUT_BUF, {TMP1, TMP2, TMP3, TMP4} + add TMP1, TMP1, OUTPUT_COL + add TMP2, TMP2, OUTPUT_COL + add TMP3, TMP3, OUTPUT_COL + add TMP4, TMP4, OUTPUT_COL + vst1.8 {d20}, [TMP1] + vtrn.8 d22, d23 + vst1.8 {d21}, [TMP2] + vst1.8 {d22}, [TMP3] + vst1.8 {d23}, [TMP4] + bx lr + + .unreq DCT_TABLE + .unreq COEF_BLOCK + .unreq OUTPUT_BUF + .unreq OUTPUT_COL + .unreq TMP1 + .unreq TMP2 + .unreq TMP3 + .unreq TMP4 + + +/*****************************************************************************/ + +/* + * jsimd_idct_4x4_neon + * + * This function contains inverse-DCT code for getting reduced-size + * 4x4 pixels output from an 8x8 DCT block. It uses the same calculations + * and produces exactly the same output as IJG's original 'jpeg_idct_4x4' + * function from jpeg-6b (jidctred.c). + * + * NOTE: jpeg-8 has an improved implementation of 4x4 inverse-DCT, which + * requires much less arithmetic operations and hence should be faster. + * The primary purpose of this particular NEON optimized function is + * bit exact compatibility with jpeg-6b. + * + * TODO: a bit better instructions scheduling can be achieved by expanding + * idct_helper/transpose_4x4 macros and reordering instructions, + * but readability will suffer somewhat. + */ + +#define CONST_BITS 13 + +#define FIX_0_211164243 (1730) /* FIX(0.211164243) */ +#define FIX_0_509795579 (4176) /* FIX(0.509795579) */ +#define FIX_0_601344887 (4926) /* FIX(0.601344887) */ +#define FIX_0_720959822 (5906) /* FIX(0.720959822) */ +#define FIX_0_765366865 (6270) /* FIX(0.765366865) */ +#define FIX_0_850430095 (6967) /* FIX(0.850430095) */ +#define FIX_0_899976223 (7373) /* FIX(0.899976223) */ +#define FIX_1_061594337 (8697) /* FIX(1.061594337) */ +#define FIX_1_272758580 (10426) /* FIX(1.272758580) */ +#define FIX_1_451774981 (11893) /* FIX(1.451774981) */ +#define FIX_1_847759065 (15137) /* FIX(1.847759065) */ +#define FIX_2_172734803 (17799) /* FIX(2.172734803) */ +#define FIX_2_562915447 (20995) /* FIX(2.562915447) */ +#define FIX_3_624509785 (29692) /* FIX(3.624509785) */ + +.balign 16 +jsimd_idct_4x4_neon_consts: + .short FIX_1_847759065 /* d0[0] */ + .short -FIX_0_765366865 /* d0[1] */ + .short -FIX_0_211164243 /* d0[2] */ + .short FIX_1_451774981 /* d0[3] */ + .short -FIX_2_172734803 /* d1[0] */ + .short FIX_1_061594337 /* d1[1] */ + .short -FIX_0_509795579 /* d1[2] */ + .short -FIX_0_601344887 /* d1[3] */ + .short FIX_0_899976223 /* d2[0] */ + .short FIX_2_562915447 /* d2[1] */ + .short 1 << (CONST_BITS+1) /* d2[2] */ + .short 0 /* d2[3] */ + +.macro idct_helper x4, x6, x8, x10, x12, x14, x16, shift, y26, y27, y28, y29 + vmull.s16 q14, \x4, d2[2] + vmlal.s16 q14, \x8, d0[0] + vmlal.s16 q14, \x14, d0[1] + + vmull.s16 q13, \x16, d1[2] + vmlal.s16 q13, \x12, d1[3] + vmlal.s16 q13, \x10, d2[0] + vmlal.s16 q13, \x6, d2[1] + + vmull.s16 q15, \x4, d2[2] + vmlsl.s16 q15, \x8, d0[0] + vmlsl.s16 q15, \x14, d0[1] + + vmull.s16 q12, \x16, d0[2] + vmlal.s16 q12, \x12, d0[3] + vmlal.s16 q12, \x10, d1[0] + vmlal.s16 q12, \x6, d1[1] + + vadd.s32 q10, q14, q13 + vsub.s32 q14, q14, q13 + + .if \shift > 16 + vrshr.s32 q10, q10, #\shift + vrshr.s32 q14, q14, #\shift + vmovn.s32 \y26, q10 + vmovn.s32 \y29, q14 + .else + vrshrn.s32 \y26, q10, #\shift + vrshrn.s32 \y29, q14, #\shift + .endif + + vadd.s32 q10, q15, q12 + vsub.s32 q15, q15, q12 + + .if \shift > 16 + vrshr.s32 q10, q10, #\shift + vrshr.s32 q15, q15, #\shift + vmovn.s32 \y27, q10 + vmovn.s32 \y28, q15 + .else + vrshrn.s32 \y27, q10, #\shift + vrshrn.s32 \y28, q15, #\shift + .endif +.endm + +asm_function jsimd_idct_4x4_neon + + DCT_TABLE .req r0 + COEF_BLOCK .req r1 + OUTPUT_BUF .req r2 + OUTPUT_COL .req r3 + TMP1 .req r0 + TMP2 .req r1 + TMP3 .req r2 + TMP4 .req ip + + vpush {d8-d15} + + /* Load constants (d3 is just used for padding) */ + adr TMP4, jsimd_idct_4x4_neon_consts + vld1.16 {d0, d1, d2, d3}, [TMP4, :128] + + /* Load all COEF_BLOCK into NEON registers with the following allocation: + * 0 1 2 3 | 4 5 6 7 + * ---------+-------- + * 0 | d4 | d5 + * 1 | d6 | d7 + * 2 | d8 | d9 + * 3 | d10 | d11 + * 4 | - | - + * 5 | d12 | d13 + * 6 | d14 | d15 + * 7 | d16 | d17 + */ + vld1.16 {d4, d5, d6, d7}, [COEF_BLOCK, :128]! + vld1.16 {d8, d9, d10, d11}, [COEF_BLOCK, :128]! + add COEF_BLOCK, COEF_BLOCK, #16 + vld1.16 {d12, d13, d14, d15}, [COEF_BLOCK, :128]! + vld1.16 {d16, d17}, [COEF_BLOCK, :128]! + /* dequantize */ + vld1.16 {d18, d19, d20, d21}, [DCT_TABLE, :128]! + vmul.s16 q2, q2, q9 + vld1.16 {d22, d23, d24, d25}, [DCT_TABLE, :128]! + vmul.s16 q3, q3, q10 + vmul.s16 q4, q4, q11 + add DCT_TABLE, DCT_TABLE, #16 + vld1.16 {d26, d27, d28, d29}, [DCT_TABLE, :128]! + vmul.s16 q5, q5, q12 + vmul.s16 q6, q6, q13 + vld1.16 {d30, d31}, [DCT_TABLE, :128]! + vmul.s16 q7, q7, q14 + vmul.s16 q8, q8, q15 + + /* Pass 1 */ + idct_helper d4, d6, d8, d10, d12, d14, d16, 12, d4, d6, d8, d10 + transpose_4x4 d4, d6, d8, d10 + idct_helper d5, d7, d9, d11, d13, d15, d17, 12, d5, d7, d9, d11 + transpose_4x4 d5, d7, d9, d11 + + /* Pass 2 */ + idct_helper d4, d6, d8, d10, d7, d9, d11, 19, d26, d27, d28, d29 + transpose_4x4 d26, d27, d28, d29 + + /* Range limit */ + vmov.u16 q15, #0x80 + vadd.s16 q13, q13, q15 + vadd.s16 q14, q14, q15 + vqmovun.s16 d26, q13 + vqmovun.s16 d27, q14 + + /* Store results to the output buffer */ + ldmia OUTPUT_BUF, {TMP1, TMP2, TMP3, TMP4} + add TMP1, TMP1, OUTPUT_COL + add TMP2, TMP2, OUTPUT_COL + add TMP3, TMP3, OUTPUT_COL + add TMP4, TMP4, OUTPUT_COL + +#if defined(__ARMEL__) && !RESPECT_STRICT_ALIGNMENT + /* We can use much less instructions on little endian systems if the + * OS kernel is not configured to trap unaligned memory accesses + */ + vst1.32 {d26[0]}, [TMP1]! + vst1.32 {d27[0]}, [TMP3]! + vst1.32 {d26[1]}, [TMP2]! + vst1.32 {d27[1]}, [TMP4]! +#else + vst1.8 {d26[0]}, [TMP1]! + vst1.8 {d27[0]}, [TMP3]! + vst1.8 {d26[1]}, [TMP1]! + vst1.8 {d27[1]}, [TMP3]! + vst1.8 {d26[2]}, [TMP1]! + vst1.8 {d27[2]}, [TMP3]! + vst1.8 {d26[3]}, [TMP1]! + vst1.8 {d27[3]}, [TMP3]! + + vst1.8 {d26[4]}, [TMP2]! + vst1.8 {d27[4]}, [TMP4]! + vst1.8 {d26[5]}, [TMP2]! + vst1.8 {d27[5]}, [TMP4]! + vst1.8 {d26[6]}, [TMP2]! + vst1.8 {d27[6]}, [TMP4]! + vst1.8 {d26[7]}, [TMP2]! + vst1.8 {d27[7]}, [TMP4]! +#endif + + vpop {d8-d15} + bx lr + + .unreq DCT_TABLE + .unreq COEF_BLOCK + .unreq OUTPUT_BUF + .unreq OUTPUT_COL + .unreq TMP1 + .unreq TMP2 + .unreq TMP3 + .unreq TMP4 + +.purgem idct_helper + + +/*****************************************************************************/ + +/* + * jsimd_idct_2x2_neon + * + * This function contains inverse-DCT code for getting reduced-size + * 2x2 pixels output from an 8x8 DCT block. It uses the same calculations + * and produces exactly the same output as IJG's original 'jpeg_idct_2x2' + * function from jpeg-6b (jidctred.c). + * + * NOTE: jpeg-8 has an improved implementation of 2x2 inverse-DCT, which + * requires much less arithmetic operations and hence should be faster. + * The primary purpose of this particular NEON optimized function is + * bit exact compatibility with jpeg-6b. + */ + +.balign 8 +jsimd_idct_2x2_neon_consts: + .short -FIX_0_720959822 /* d0[0] */ + .short FIX_0_850430095 /* d0[1] */ + .short -FIX_1_272758580 /* d0[2] */ + .short FIX_3_624509785 /* d0[3] */ + +.macro idct_helper x4, x6, x10, x12, x16, shift, y26, y27 + vshll.s16 q14, \x4, #15 + vmull.s16 q13, \x6, d0[3] + vmlal.s16 q13, \x10, d0[2] + vmlal.s16 q13, \x12, d0[1] + vmlal.s16 q13, \x16, d0[0] + + vadd.s32 q10, q14, q13 + vsub.s32 q14, q14, q13 + + .if \shift > 16 + vrshr.s32 q10, q10, #\shift + vrshr.s32 q14, q14, #\shift + vmovn.s32 \y26, q10 + vmovn.s32 \y27, q14 + .else + vrshrn.s32 \y26, q10, #\shift + vrshrn.s32 \y27, q14, #\shift + .endif +.endm + +asm_function jsimd_idct_2x2_neon + + DCT_TABLE .req r0 + COEF_BLOCK .req r1 + OUTPUT_BUF .req r2 + OUTPUT_COL .req r3 + TMP1 .req r0 + TMP2 .req ip + + vpush {d8-d15} + + /* Load constants */ + adr TMP2, jsimd_idct_2x2_neon_consts + vld1.16 {d0}, [TMP2, :64] + + /* Load all COEF_BLOCK into NEON registers with the following allocation: + * 0 1 2 3 | 4 5 6 7 + * ---------+-------- + * 0 | d4 | d5 + * 1 | d6 | d7 + * 2 | - | - + * 3 | d10 | d11 + * 4 | - | - + * 5 | d12 | d13 + * 6 | - | - + * 7 | d16 | d17 + */ + vld1.16 {d4, d5, d6, d7}, [COEF_BLOCK, :128]! + add COEF_BLOCK, COEF_BLOCK, #16 + vld1.16 {d10, d11}, [COEF_BLOCK, :128]! + add COEF_BLOCK, COEF_BLOCK, #16 + vld1.16 {d12, d13}, [COEF_BLOCK, :128]! + add COEF_BLOCK, COEF_BLOCK, #16 + vld1.16 {d16, d17}, [COEF_BLOCK, :128]! + /* Dequantize */ + vld1.16 {d18, d19, d20, d21}, [DCT_TABLE, :128]! + vmul.s16 q2, q2, q9 + vmul.s16 q3, q3, q10 + add DCT_TABLE, DCT_TABLE, #16 + vld1.16 {d24, d25}, [DCT_TABLE, :128]! + vmul.s16 q5, q5, q12 + add DCT_TABLE, DCT_TABLE, #16 + vld1.16 {d26, d27}, [DCT_TABLE, :128]! + vmul.s16 q6, q6, q13 + add DCT_TABLE, DCT_TABLE, #16 + vld1.16 {d30, d31}, [DCT_TABLE, :128]! + vmul.s16 q8, q8, q15 + + /* Pass 1 */ +#if 0 + idct_helper d4, d6, d10, d12, d16, 13, d4, d6 + transpose_4x4 d4, d6, d8, d10 + idct_helper d5, d7, d11, d13, d17, 13, d5, d7 + transpose_4x4 d5, d7, d9, d11 +#else + vmull.s16 q13, d6, d0[3] + vmlal.s16 q13, d10, d0[2] + vmlal.s16 q13, d12, d0[1] + vmlal.s16 q13, d16, d0[0] + vmull.s16 q12, d7, d0[3] + vmlal.s16 q12, d11, d0[2] + vmlal.s16 q12, d13, d0[1] + vmlal.s16 q12, d17, d0[0] + vshll.s16 q14, d4, #15 + vshll.s16 q15, d5, #15 + vadd.s32 q10, q14, q13 + vsub.s32 q14, q14, q13 + vrshrn.s32 d4, q10, #13 + vrshrn.s32 d6, q14, #13 + vadd.s32 q10, q15, q12 + vsub.s32 q14, q15, q12 + vrshrn.s32 d5, q10, #13 + vrshrn.s32 d7, q14, #13 + vtrn.16 q2, q3 + vtrn.32 q3, q5 +#endif + + /* Pass 2 */ + idct_helper d4, d6, d10, d7, d11, 20, d26, d27 + + /* Range limit */ + vmov.u16 q15, #0x80 + vadd.s16 q13, q13, q15 + vqmovun.s16 d26, q13 + vqmovun.s16 d27, q13 + + /* Store results to the output buffer */ + ldmia OUTPUT_BUF, {TMP1, TMP2} + add TMP1, TMP1, OUTPUT_COL + add TMP2, TMP2, OUTPUT_COL + + vst1.8 {d26[0]}, [TMP1]! + vst1.8 {d27[4]}, [TMP1]! + vst1.8 {d26[1]}, [TMP2]! + vst1.8 {d27[5]}, [TMP2]! + + vpop {d8-d15} + bx lr + + .unreq DCT_TABLE + .unreq COEF_BLOCK + .unreq OUTPUT_BUF + .unreq OUTPUT_COL + .unreq TMP1 + .unreq TMP2 + +.purgem idct_helper + + +/*****************************************************************************/ + +/* + * jsimd_ycc_extrgb_convert_neon + * jsimd_ycc_extbgr_convert_neon + * jsimd_ycc_extrgbx_convert_neon + * jsimd_ycc_extbgrx_convert_neon + * jsimd_ycc_extxbgr_convert_neon + * jsimd_ycc_extxrgb_convert_neon + * + * Colorspace conversion YCbCr -> RGB + */ + + +.macro do_load size + .if \size == 8 + vld1.8 {d4}, [U, :64]! + vld1.8 {d5}, [V, :64]! + vld1.8 {d0}, [Y, :64]! + pld [U, #64] + pld [V, #64] + pld [Y, #64] + .elseif \size == 4 + vld1.8 {d4[0]}, [U]! + vld1.8 {d4[1]}, [U]! + vld1.8 {d4[2]}, [U]! + vld1.8 {d4[3]}, [U]! + vld1.8 {d5[0]}, [V]! + vld1.8 {d5[1]}, [V]! + vld1.8 {d5[2]}, [V]! + vld1.8 {d5[3]}, [V]! + vld1.8 {d0[0]}, [Y]! + vld1.8 {d0[1]}, [Y]! + vld1.8 {d0[2]}, [Y]! + vld1.8 {d0[3]}, [Y]! + .elseif \size == 2 + vld1.8 {d4[4]}, [U]! + vld1.8 {d4[5]}, [U]! + vld1.8 {d5[4]}, [V]! + vld1.8 {d5[5]}, [V]! + vld1.8 {d0[4]}, [Y]! + vld1.8 {d0[5]}, [Y]! + .elseif \size == 1 + vld1.8 {d4[6]}, [U]! + vld1.8 {d5[6]}, [V]! + vld1.8 {d0[6]}, [Y]! + .else + .error unsupported macroblock size + .endif +.endm + +.macro do_store bpp, size + .if \bpp == 24 + .if \size == 8 + vst3.8 {d10, d11, d12}, [RGB]! + .elseif \size == 4 + vst3.8 {d10[0], d11[0], d12[0]}, [RGB]! + vst3.8 {d10[1], d11[1], d12[1]}, [RGB]! + vst3.8 {d10[2], d11[2], d12[2]}, [RGB]! + vst3.8 {d10[3], d11[3], d12[3]}, [RGB]! + .elseif \size == 2 + vst3.8 {d10[4], d11[4], d12[4]}, [RGB]! + vst3.8 {d10[5], d11[5], d12[5]}, [RGB]! + .elseif \size == 1 + vst3.8 {d10[6], d11[6], d12[6]}, [RGB]! + .else + .error unsupported macroblock size + .endif + .elseif \bpp == 32 + .if \size == 8 + vst4.8 {d10, d11, d12, d13}, [RGB]! + .elseif \size == 4 + vst4.8 {d10[0], d11[0], d12[0], d13[0]}, [RGB]! + vst4.8 {d10[1], d11[1], d12[1], d13[1]}, [RGB]! + vst4.8 {d10[2], d11[2], d12[2], d13[2]}, [RGB]! + vst4.8 {d10[3], d11[3], d12[3], d13[3]}, [RGB]! + .elseif \size == 2 + vst4.8 {d10[4], d11[4], d12[4], d13[4]}, [RGB]! + vst4.8 {d10[5], d11[5], d12[5], d13[5]}, [RGB]! + .elseif \size == 1 + vst4.8 {d10[6], d11[6], d12[6], d13[6]}, [RGB]! + .else + .error unsupported macroblock size + .endif + .elseif \bpp == 16 + .if \size == 8 + vst1.16 {q15}, [RGB]! + .elseif \size == 4 + vst1.16 {d30}, [RGB]! + .elseif \size == 2 + vst1.16 {d31[0]}, [RGB]! + vst1.16 {d31[1]}, [RGB]! + .elseif \size == 1 + vst1.16 {d31[2]}, [RGB]! + .else + .error unsupported macroblock size + .endif + .else + .error unsupported bpp + .endif +.endm + +.macro generate_jsimd_ycc_rgb_convert_neon colorid, bpp, r_offs, g_offs, b_offs + +/* + * 2-stage pipelined YCbCr->RGB conversion + */ + +.macro do_yuv_to_rgb_stage1 + vaddw.u8 q3, q1, d4 /* q3 = u - 128 */ + vaddw.u8 q4, q1, d5 /* q2 = v - 128 */ + vmull.s16 q10, d6, d1[1] /* multiply by -11277 */ + vmlal.s16 q10, d8, d1[2] /* multiply by -23401 */ + vmull.s16 q11, d7, d1[1] /* multiply by -11277 */ + vmlal.s16 q11, d9, d1[2] /* multiply by -23401 */ + vmull.s16 q12, d8, d1[0] /* multiply by 22971 */ + vmull.s16 q13, d9, d1[0] /* multiply by 22971 */ + vmull.s16 q14, d6, d1[3] /* multiply by 29033 */ + vmull.s16 q15, d7, d1[3] /* multiply by 29033 */ +.endm + +.macro do_yuv_to_rgb_stage2 + vrshrn.s32 d20, q10, #15 + vrshrn.s32 d21, q11, #15 + vrshrn.s32 d24, q12, #14 + vrshrn.s32 d25, q13, #14 + vrshrn.s32 d28, q14, #14 + vrshrn.s32 d29, q15, #14 + vaddw.u8 q11, q10, d0 + vaddw.u8 q12, q12, d0 + vaddw.u8 q14, q14, d0 + .if \bpp != 16 + vqmovun.s16 d1\g_offs, q11 + vqmovun.s16 d1\r_offs, q12 + vqmovun.s16 d1\b_offs, q14 + .else /* rgb565 */ + vqshlu.s16 q13, q11, #8 + vqshlu.s16 q15, q12, #8 + vqshlu.s16 q14, q14, #8 + vsri.u16 q15, q13, #5 + vsri.u16 q15, q14, #11 + .endif +.endm + +.macro do_yuv_to_rgb_stage2_store_load_stage1 + /* "do_yuv_to_rgb_stage2" and "store" */ + vrshrn.s32 d20, q10, #15 + /* "load" and "do_yuv_to_rgb_stage1" */ + pld [U, #64] + vrshrn.s32 d21, q11, #15 + pld [V, #64] + vrshrn.s32 d24, q12, #14 + vrshrn.s32 d25, q13, #14 + vld1.8 {d4}, [U, :64]! + vrshrn.s32 d28, q14, #14 + vld1.8 {d5}, [V, :64]! + vrshrn.s32 d29, q15, #14 + vaddw.u8 q3, q1, d4 /* q3 = u - 128 */ + vaddw.u8 q4, q1, d5 /* q2 = v - 128 */ + vaddw.u8 q11, q10, d0 + vmull.s16 q10, d6, d1[1] /* multiply by -11277 */ + vmlal.s16 q10, d8, d1[2] /* multiply by -23401 */ + vaddw.u8 q12, q12, d0 + vaddw.u8 q14, q14, d0 + .if \bpp != 16 /**************** rgb24/rgb32 ******************************/ + vqmovun.s16 d1\g_offs, q11 + pld [Y, #64] + vqmovun.s16 d1\r_offs, q12 + vld1.8 {d0}, [Y, :64]! + vqmovun.s16 d1\b_offs, q14 + vmull.s16 q11, d7, d1[1] /* multiply by -11277 */ + vmlal.s16 q11, d9, d1[2] /* multiply by -23401 */ + do_store \bpp, 8 + vmull.s16 q12, d8, d1[0] /* multiply by 22971 */ + vmull.s16 q13, d9, d1[0] /* multiply by 22971 */ + vmull.s16 q14, d6, d1[3] /* multiply by 29033 */ + vmull.s16 q15, d7, d1[3] /* multiply by 29033 */ + .else /**************************** rgb565 ********************************/ + vqshlu.s16 q13, q11, #8 + pld [Y, #64] + vqshlu.s16 q15, q12, #8 + vqshlu.s16 q14, q14, #8 + vld1.8 {d0}, [Y, :64]! + vmull.s16 q11, d7, d1[1] + vmlal.s16 q11, d9, d1[2] + vsri.u16 q15, q13, #5 + vmull.s16 q12, d8, d1[0] + vsri.u16 q15, q14, #11 + vmull.s16 q13, d9, d1[0] + vmull.s16 q14, d6, d1[3] + do_store \bpp, 8 + vmull.s16 q15, d7, d1[3] + .endif +.endm + +.macro do_yuv_to_rgb + do_yuv_to_rgb_stage1 + do_yuv_to_rgb_stage2 +.endm + +/* Apple gas crashes on adrl, work around that by using adr. + * But this requires a copy of these constants for each function. + */ + +.balign 16 +jsimd_ycc_\colorid\()_neon_consts: + .short 0, 0, 0, 0 + .short 22971, -11277, -23401, 29033 + .short -128, -128, -128, -128 + .short -128, -128, -128, -128 + +asm_function jsimd_ycc_\colorid\()_convert_neon + OUTPUT_WIDTH .req r0 + INPUT_BUF .req r1 + INPUT_ROW .req r2 + OUTPUT_BUF .req r3 + NUM_ROWS .req r4 + + INPUT_BUF0 .req r5 + INPUT_BUF1 .req r6 + INPUT_BUF2 .req INPUT_BUF + + RGB .req r7 + Y .req r8 + U .req r9 + V .req r10 + N .req ip + + /* Load constants to d1, d2, d3 (d0 is just used for padding) */ + adr ip, jsimd_ycc_\colorid\()_neon_consts + vld1.16 {d0, d1, d2, d3}, [ip, :128] + + /* Save ARM registers and handle input arguments */ + push {r4, r5, r6, r7, r8, r9, r10, lr} + ldr NUM_ROWS, [sp, #(4 * 8)] + ldr INPUT_BUF0, [INPUT_BUF] + ldr INPUT_BUF1, [INPUT_BUF, #4] + ldr INPUT_BUF2, [INPUT_BUF, #8] + .unreq INPUT_BUF + + /* Save NEON registers */ + vpush {d8-d15} + + /* Initially set d10, d11, d12, d13 to 0xFF */ + vmov.u8 q5, #255 + vmov.u8 q6, #255 + + /* Outer loop over scanlines */ + cmp NUM_ROWS, #1 + blt 9f +0: + ldr Y, [INPUT_BUF0, INPUT_ROW, lsl #2] + ldr U, [INPUT_BUF1, INPUT_ROW, lsl #2] + mov N, OUTPUT_WIDTH + ldr V, [INPUT_BUF2, INPUT_ROW, lsl #2] + add INPUT_ROW, INPUT_ROW, #1 + ldr RGB, [OUTPUT_BUF], #4 + + /* Inner loop over pixels */ + subs N, N, #8 + blt 3f + do_load 8 + do_yuv_to_rgb_stage1 + subs N, N, #8 + blt 2f +1: + do_yuv_to_rgb_stage2_store_load_stage1 + subs N, N, #8 + bge 1b +2: + do_yuv_to_rgb_stage2 + do_store \bpp, 8 + tst N, #7 + beq 8f +3: + tst N, #4 + beq 3f + do_load 4 +3: + tst N, #2 + beq 4f + do_load 2 +4: + tst N, #1 + beq 5f + do_load 1 +5: + do_yuv_to_rgb + tst N, #4 + beq 6f + do_store \bpp, 4 +6: + tst N, #2 + beq 7f + do_store \bpp, 2 +7: + tst N, #1 + beq 8f + do_store \bpp, 1 +8: + subs NUM_ROWS, NUM_ROWS, #1 + bgt 0b +9: + /* Restore all registers and return */ + vpop {d8-d15} + pop {r4, r5, r6, r7, r8, r9, r10, pc} + + .unreq OUTPUT_WIDTH + .unreq INPUT_ROW + .unreq OUTPUT_BUF + .unreq NUM_ROWS + .unreq INPUT_BUF0 + .unreq INPUT_BUF1 + .unreq INPUT_BUF2 + .unreq RGB + .unreq Y + .unreq U + .unreq V + .unreq N + +.purgem do_yuv_to_rgb +.purgem do_yuv_to_rgb_stage1 +.purgem do_yuv_to_rgb_stage2 +.purgem do_yuv_to_rgb_stage2_store_load_stage1 + +.endm + +/*--------------------------------- id ----- bpp R G B */ +generate_jsimd_ycc_rgb_convert_neon extrgb, 24, 0, 1, 2 +generate_jsimd_ycc_rgb_convert_neon extbgr, 24, 2, 1, 0 +generate_jsimd_ycc_rgb_convert_neon extrgbx, 32, 0, 1, 2 +generate_jsimd_ycc_rgb_convert_neon extbgrx, 32, 2, 1, 0 +generate_jsimd_ycc_rgb_convert_neon extxbgr, 32, 3, 2, 1 +generate_jsimd_ycc_rgb_convert_neon extxrgb, 32, 1, 2, 3 +generate_jsimd_ycc_rgb_convert_neon rgb565, 16, 0, 0, 0 + +.purgem do_load +.purgem do_store + + +/*****************************************************************************/ + +/* + * jsimd_extrgb_ycc_convert_neon + * jsimd_extbgr_ycc_convert_neon + * jsimd_extrgbx_ycc_convert_neon + * jsimd_extbgrx_ycc_convert_neon + * jsimd_extxbgr_ycc_convert_neon + * jsimd_extxrgb_ycc_convert_neon + * + * Colorspace conversion RGB -> YCbCr + */ + +.macro do_store size + .if \size == 8 + vst1.8 {d20}, [Y]! + vst1.8 {d21}, [U]! + vst1.8 {d22}, [V]! + .elseif \size == 4 + vst1.8 {d20[0]}, [Y]! + vst1.8 {d20[1]}, [Y]! + vst1.8 {d20[2]}, [Y]! + vst1.8 {d20[3]}, [Y]! + vst1.8 {d21[0]}, [U]! + vst1.8 {d21[1]}, [U]! + vst1.8 {d21[2]}, [U]! + vst1.8 {d21[3]}, [U]! + vst1.8 {d22[0]}, [V]! + vst1.8 {d22[1]}, [V]! + vst1.8 {d22[2]}, [V]! + vst1.8 {d22[3]}, [V]! + .elseif \size == 2 + vst1.8 {d20[4]}, [Y]! + vst1.8 {d20[5]}, [Y]! + vst1.8 {d21[4]}, [U]! + vst1.8 {d21[5]}, [U]! + vst1.8 {d22[4]}, [V]! + vst1.8 {d22[5]}, [V]! + .elseif \size == 1 + vst1.8 {d20[6]}, [Y]! + vst1.8 {d21[6]}, [U]! + vst1.8 {d22[6]}, [V]! + .else + .error unsupported macroblock size + .endif +.endm + +.macro do_load bpp, size + .if \bpp == 24 + .if \size == 8 + vld3.8 {d10, d11, d12}, [RGB]! + pld [RGB, #128] + .elseif \size == 4 + vld3.8 {d10[0], d11[0], d12[0]}, [RGB]! + vld3.8 {d10[1], d11[1], d12[1]}, [RGB]! + vld3.8 {d10[2], d11[2], d12[2]}, [RGB]! + vld3.8 {d10[3], d11[3], d12[3]}, [RGB]! + .elseif \size == 2 + vld3.8 {d10[4], d11[4], d12[4]}, [RGB]! + vld3.8 {d10[5], d11[5], d12[5]}, [RGB]! + .elseif \size == 1 + vld3.8 {d10[6], d11[6], d12[6]}, [RGB]! + .else + .error unsupported macroblock size + .endif + .elseif \bpp == 32 + .if \size == 8 + vld4.8 {d10, d11, d12, d13}, [RGB]! + pld [RGB, #128] + .elseif \size == 4 + vld4.8 {d10[0], d11[0], d12[0], d13[0]}, [RGB]! + vld4.8 {d10[1], d11[1], d12[1], d13[1]}, [RGB]! + vld4.8 {d10[2], d11[2], d12[2], d13[2]}, [RGB]! + vld4.8 {d10[3], d11[3], d12[3], d13[3]}, [RGB]! + .elseif \size == 2 + vld4.8 {d10[4], d11[4], d12[4], d13[4]}, [RGB]! + vld4.8 {d10[5], d11[5], d12[5], d13[5]}, [RGB]! + .elseif \size == 1 + vld4.8 {d10[6], d11[6], d12[6], d13[6]}, [RGB]! + .else + .error unsupported macroblock size + .endif + .else + .error unsupported bpp + .endif +.endm + +.macro generate_jsimd_rgb_ycc_convert_neon colorid, bpp, r_offs, g_offs, b_offs + +/* + * 2-stage pipelined RGB->YCbCr conversion + */ + +.macro do_rgb_to_yuv_stage1 + vmovl.u8 q2, d1\r_offs /* r = { d4, d5 } */ + vmovl.u8 q3, d1\g_offs /* g = { d6, d7 } */ + vmovl.u8 q4, d1\b_offs /* b = { d8, d9 } */ + vmull.u16 q7, d4, d0[0] + vmlal.u16 q7, d6, d0[1] + vmlal.u16 q7, d8, d0[2] + vmull.u16 q8, d5, d0[0] + vmlal.u16 q8, d7, d0[1] + vmlal.u16 q8, d9, d0[2] + vrev64.32 q9, q1 + vrev64.32 q13, q1 + vmlsl.u16 q9, d4, d0[3] + vmlsl.u16 q9, d6, d1[0] + vmlal.u16 q9, d8, d1[1] + vmlsl.u16 q13, d5, d0[3] + vmlsl.u16 q13, d7, d1[0] + vmlal.u16 q13, d9, d1[1] + vrev64.32 q14, q1 + vrev64.32 q15, q1 + vmlal.u16 q14, d4, d1[1] + vmlsl.u16 q14, d6, d1[2] + vmlsl.u16 q14, d8, d1[3] + vmlal.u16 q15, d5, d1[1] + vmlsl.u16 q15, d7, d1[2] + vmlsl.u16 q15, d9, d1[3] +.endm + +.macro do_rgb_to_yuv_stage2 + vrshrn.u32 d20, q7, #16 + vrshrn.u32 d21, q8, #16 + vshrn.u32 d22, q9, #16 + vshrn.u32 d23, q13, #16 + vshrn.u32 d24, q14, #16 + vshrn.u32 d25, q15, #16 + vmovn.u16 d20, q10 /* d20 = y */ + vmovn.u16 d21, q11 /* d21 = u */ + vmovn.u16 d22, q12 /* d22 = v */ +.endm + +.macro do_rgb_to_yuv + do_rgb_to_yuv_stage1 + do_rgb_to_yuv_stage2 +.endm + +.macro do_rgb_to_yuv_stage2_store_load_stage1 + vrshrn.u32 d20, q7, #16 + vrshrn.u32 d21, q8, #16 + vshrn.u32 d22, q9, #16 + vrev64.32 q9, q1 + vshrn.u32 d23, q13, #16 + vrev64.32 q13, q1 + vshrn.u32 d24, q14, #16 + vshrn.u32 d25, q15, #16 + do_load \bpp, 8 + vmovn.u16 d20, q10 /* d20 = y */ + vmovl.u8 q2, d1\r_offs /* r = { d4, d5 } */ + vmovn.u16 d21, q11 /* d21 = u */ + vmovl.u8 q3, d1\g_offs /* g = { d6, d7 } */ + vmovn.u16 d22, q12 /* d22 = v */ + vmovl.u8 q4, d1\b_offs /* b = { d8, d9 } */ + vmull.u16 q7, d4, d0[0] + vmlal.u16 q7, d6, d0[1] + vmlal.u16 q7, d8, d0[2] + vst1.8 {d20}, [Y]! + vmull.u16 q8, d5, d0[0] + vmlal.u16 q8, d7, d0[1] + vmlal.u16 q8, d9, d0[2] + vmlsl.u16 q9, d4, d0[3] + vmlsl.u16 q9, d6, d1[0] + vmlal.u16 q9, d8, d1[1] + vst1.8 {d21}, [U]! + vmlsl.u16 q13, d5, d0[3] + vmlsl.u16 q13, d7, d1[0] + vmlal.u16 q13, d9, d1[1] + vrev64.32 q14, q1 + vrev64.32 q15, q1 + vmlal.u16 q14, d4, d1[1] + vmlsl.u16 q14, d6, d1[2] + vmlsl.u16 q14, d8, d1[3] + vst1.8 {d22}, [V]! + vmlal.u16 q15, d5, d1[1] + vmlsl.u16 q15, d7, d1[2] + vmlsl.u16 q15, d9, d1[3] +.endm + +.balign 16 +jsimd_\colorid\()_ycc_neon_consts: + .short 19595, 38470, 7471, 11059 + .short 21709, 32768, 27439, 5329 + .short 32767, 128, 32767, 128 + .short 32767, 128, 32767, 128 + +asm_function jsimd_\colorid\()_ycc_convert_neon + OUTPUT_WIDTH .req r0 + INPUT_BUF .req r1 + OUTPUT_BUF .req r2 + OUTPUT_ROW .req r3 + NUM_ROWS .req r4 + + OUTPUT_BUF0 .req r5 + OUTPUT_BUF1 .req r6 + OUTPUT_BUF2 .req OUTPUT_BUF + + RGB .req r7 + Y .req r8 + U .req r9 + V .req r10 + N .req ip + + /* Load constants to d0, d1, d2, d3 */ + adr ip, jsimd_\colorid\()_ycc_neon_consts + vld1.16 {d0, d1, d2, d3}, [ip, :128] + + /* Save ARM registers and handle input arguments */ + push {r4, r5, r6, r7, r8, r9, r10, lr} + ldr NUM_ROWS, [sp, #(4 * 8)] + ldr OUTPUT_BUF0, [OUTPUT_BUF] + ldr OUTPUT_BUF1, [OUTPUT_BUF, #4] + ldr OUTPUT_BUF2, [OUTPUT_BUF, #8] + .unreq OUTPUT_BUF + + /* Save NEON registers */ + vpush {d8-d15} + + /* Outer loop over scanlines */ + cmp NUM_ROWS, #1 + blt 9f +0: + ldr Y, [OUTPUT_BUF0, OUTPUT_ROW, lsl #2] + ldr U, [OUTPUT_BUF1, OUTPUT_ROW, lsl #2] + mov N, OUTPUT_WIDTH + ldr V, [OUTPUT_BUF2, OUTPUT_ROW, lsl #2] + add OUTPUT_ROW, OUTPUT_ROW, #1 + ldr RGB, [INPUT_BUF], #4 + + /* Inner loop over pixels */ + subs N, N, #8 + blt 3f + do_load \bpp, 8 + do_rgb_to_yuv_stage1 + subs N, N, #8 + blt 2f +1: + do_rgb_to_yuv_stage2_store_load_stage1 + subs N, N, #8 + bge 1b +2: + do_rgb_to_yuv_stage2 + do_store 8 + tst N, #7 + beq 8f +3: + tst N, #4 + beq 3f + do_load \bpp, 4 +3: + tst N, #2 + beq 4f + do_load \bpp, 2 +4: + tst N, #1 + beq 5f + do_load \bpp, 1 +5: + do_rgb_to_yuv + tst N, #4 + beq 6f + do_store 4 +6: + tst N, #2 + beq 7f + do_store 2 +7: + tst N, #1 + beq 8f + do_store 1 +8: + subs NUM_ROWS, NUM_ROWS, #1 + bgt 0b +9: + /* Restore all registers and return */ + vpop {d8-d15} + pop {r4, r5, r6, r7, r8, r9, r10, pc} + + .unreq OUTPUT_WIDTH + .unreq OUTPUT_ROW + .unreq INPUT_BUF + .unreq NUM_ROWS + .unreq OUTPUT_BUF0 + .unreq OUTPUT_BUF1 + .unreq OUTPUT_BUF2 + .unreq RGB + .unreq Y + .unreq U + .unreq V + .unreq N + +.purgem do_rgb_to_yuv +.purgem do_rgb_to_yuv_stage1 +.purgem do_rgb_to_yuv_stage2 +.purgem do_rgb_to_yuv_stage2_store_load_stage1 + +.endm + +/*--------------------------------- id ----- bpp R G B */ +generate_jsimd_rgb_ycc_convert_neon extrgb, 24, 0, 1, 2 +generate_jsimd_rgb_ycc_convert_neon extbgr, 24, 2, 1, 0 +generate_jsimd_rgb_ycc_convert_neon extrgbx, 32, 0, 1, 2 +generate_jsimd_rgb_ycc_convert_neon extbgrx, 32, 2, 1, 0 +generate_jsimd_rgb_ycc_convert_neon extxbgr, 32, 3, 2, 1 +generate_jsimd_rgb_ycc_convert_neon extxrgb, 32, 1, 2, 3 + +.purgem do_load +.purgem do_store + + +/*****************************************************************************/ + +/* + * Load data into workspace, applying unsigned->signed conversion + * + * TODO: can be combined with 'jsimd_fdct_ifast_neon' to get + * rid of VST1.16 instructions + */ + +asm_function jsimd_convsamp_neon + SAMPLE_DATA .req r0 + START_COL .req r1 + WORKSPACE .req r2 + TMP1 .req r3 + TMP2 .req r4 + TMP3 .req r5 + TMP4 .req ip + + push {r4, r5} + vmov.u8 d0, #128 + + ldmia SAMPLE_DATA!, {TMP1, TMP2, TMP3, TMP4} + add TMP1, TMP1, START_COL + add TMP2, TMP2, START_COL + add TMP3, TMP3, START_COL + add TMP4, TMP4, START_COL + vld1.8 {d16}, [TMP1] + vsubl.u8 q8, d16, d0 + vld1.8 {d18}, [TMP2] + vsubl.u8 q9, d18, d0 + vld1.8 {d20}, [TMP3] + vsubl.u8 q10, d20, d0 + vld1.8 {d22}, [TMP4] + ldmia SAMPLE_DATA!, {TMP1, TMP2, TMP3, TMP4} + vsubl.u8 q11, d22, d0 + vst1.16 {d16, d17, d18, d19}, [WORKSPACE, :128]! + add TMP1, TMP1, START_COL + add TMP2, TMP2, START_COL + vst1.16 {d20, d21, d22, d23}, [WORKSPACE, :128]! + add TMP3, TMP3, START_COL + add TMP4, TMP4, START_COL + vld1.8 {d24}, [TMP1] + vsubl.u8 q12, d24, d0 + vld1.8 {d26}, [TMP2] + vsubl.u8 q13, d26, d0 + vld1.8 {d28}, [TMP3] + vsubl.u8 q14, d28, d0 + vld1.8 {d30}, [TMP4] + vsubl.u8 q15, d30, d0 + vst1.16 {d24, d25, d26, d27}, [WORKSPACE, :128]! + vst1.16 {d28, d29, d30, d31}, [WORKSPACE, :128]! + pop {r4, r5} + bx lr + + .unreq SAMPLE_DATA + .unreq START_COL + .unreq WORKSPACE + .unreq TMP1 + .unreq TMP2 + .unreq TMP3 + .unreq TMP4 + + +/*****************************************************************************/ + +/* + * jsimd_fdct_ifast_neon + * + * This function contains a fast, not so accurate integer implementation of + * the forward DCT (Discrete Cosine Transform). It uses the same calculations + * and produces exactly the same output as IJG's original 'jpeg_fdct_ifast' + * function from jfdctfst.c + * + * TODO: can be combined with 'jsimd_convsamp_neon' to get + * rid of a bunch of VLD1.16 instructions + */ + +#define XFIX_0_382683433 d0[0] +#define XFIX_0_541196100 d0[1] +#define XFIX_0_707106781 d0[2] +#define XFIX_1_306562965 d0[3] + +.balign 16 +jsimd_fdct_ifast_neon_consts: + .short (98 * 128) /* XFIX_0_382683433 */ + .short (139 * 128) /* XFIX_0_541196100 */ + .short (181 * 128) /* XFIX_0_707106781 */ + .short (334 * 128 - 256 * 128) /* XFIX_1_306562965 */ + +asm_function jsimd_fdct_ifast_neon + + DATA .req r0 + TMP .req ip + + vpush {d8-d15} + + /* Load constants */ + adr TMP, jsimd_fdct_ifast_neon_consts + vld1.16 {d0}, [TMP, :64] + + /* Load all DATA into NEON registers with the following allocation: + * 0 1 2 3 | 4 5 6 7 + * ---------+-------- + * 0 | d16 | d17 | q8 + * 1 | d18 | d19 | q9 + * 2 | d20 | d21 | q10 + * 3 | d22 | d23 | q11 + * 4 | d24 | d25 | q12 + * 5 | d26 | d27 | q13 + * 6 | d28 | d29 | q14 + * 7 | d30 | d31 | q15 + */ + + vld1.16 {d16, d17, d18, d19}, [DATA, :128]! + vld1.16 {d20, d21, d22, d23}, [DATA, :128]! + vld1.16 {d24, d25, d26, d27}, [DATA, :128]! + vld1.16 {d28, d29, d30, d31}, [DATA, :128] + sub DATA, DATA, #(128 - 32) + + mov TMP, #2 +1: + /* Transpose */ + vtrn.16 q12, q13 + vtrn.16 q10, q11 + vtrn.16 q8, q9 + vtrn.16 q14, q15 + vtrn.32 q9, q11 + vtrn.32 q13, q15 + vtrn.32 q8, q10 + vtrn.32 q12, q14 + vswp d30, d23 + vswp d24, d17 + vswp d26, d19 + /* 1-D FDCT */ + vadd.s16 q2, q11, q12 + vswp d28, d21 + vsub.s16 q12, q11, q12 + vsub.s16 q6, q10, q13 + vadd.s16 q10, q10, q13 + vsub.s16 q7, q9, q14 + vadd.s16 q9, q9, q14 + vsub.s16 q1, q8, q15 + vadd.s16 q8, q8, q15 + vsub.s16 q4, q9, q10 + vsub.s16 q5, q8, q2 + vadd.s16 q3, q9, q10 + vadd.s16 q4, q4, q5 + vadd.s16 q2, q8, q2 + vqdmulh.s16 q4, q4, XFIX_0_707106781 + vadd.s16 q11, q12, q6 + vadd.s16 q8, q2, q3 + vsub.s16 q12, q2, q3 + vadd.s16 q3, q6, q7 + vadd.s16 q7, q7, q1 + vqdmulh.s16 q3, q3, XFIX_0_707106781 + vsub.s16 q6, q11, q7 + vadd.s16 q10, q5, q4 + vqdmulh.s16 q6, q6, XFIX_0_382683433 + vsub.s16 q14, q5, q4 + vqdmulh.s16 q11, q11, XFIX_0_541196100 + vqdmulh.s16 q5, q7, XFIX_1_306562965 + vadd.s16 q4, q1, q3 + vsub.s16 q3, q1, q3 + vadd.s16 q7, q7, q6 + vadd.s16 q11, q11, q6 + vadd.s16 q7, q7, q5 + vadd.s16 q13, q3, q11 + vsub.s16 q11, q3, q11 + vadd.s16 q9, q4, q7 + vsub.s16 q15, q4, q7 + subs TMP, TMP, #1 + bne 1b + + /* store results */ + vst1.16 {d16, d17, d18, d19}, [DATA, :128]! + vst1.16 {d20, d21, d22, d23}, [DATA, :128]! + vst1.16 {d24, d25, d26, d27}, [DATA, :128]! + vst1.16 {d28, d29, d30, d31}, [DATA, :128] + + vpop {d8-d15} + bx lr + + .unreq DATA + .unreq TMP + + +/*****************************************************************************/ + +/* + * GLOBAL(void) + * jsimd_quantize_neon (JCOEFPTR coef_block, DCTELEM *divisors, + * DCTELEM *workspace); + * + * Note: the code uses 2 stage pipelining in order to improve instructions + * scheduling and eliminate stalls (this provides ~15% better + * performance for this function on both ARM Cortex-A8 and + * ARM Cortex-A9 when compared to the non-pipelined variant). + * The instructions which belong to the second stage use different + * indentation for better readiability. + */ +asm_function jsimd_quantize_neon + + COEF_BLOCK .req r0 + DIVISORS .req r1 + WORKSPACE .req r2 + + RECIPROCAL .req DIVISORS + CORRECTION .req r3 + SHIFT .req ip + LOOP_COUNT .req r4 + + vld1.16 {d0, d1, d2, d3}, [WORKSPACE, :128]! + vabs.s16 q12, q0 + add CORRECTION, DIVISORS, #(64 * 2) + add SHIFT, DIVISORS, #(64 * 6) + vld1.16 {d20, d21, d22, d23}, [CORRECTION, :128]! + vabs.s16 q13, q1 + vld1.16 {d16, d17, d18, d19}, [RECIPROCAL, :128]! + vadd.u16 q12, q12, q10 /* add correction */ + vadd.u16 q13, q13, q11 + vmull.u16 q10, d24, d16 /* multiply by reciprocal */ + vmull.u16 q11, d25, d17 + vmull.u16 q8, d26, d18 + vmull.u16 q9, d27, d19 + vld1.16 {d24, d25, d26, d27}, [SHIFT, :128]! + vshrn.u32 d20, q10, #16 + vshrn.u32 d21, q11, #16 + vshrn.u32 d22, q8, #16 + vshrn.u32 d23, q9, #16 + vneg.s16 q12, q12 + vneg.s16 q13, q13 + vshr.s16 q2, q0, #15 /* extract sign */ + vshr.s16 q3, q1, #15 + vshl.u16 q14, q10, q12 /* shift */ + vshl.u16 q15, q11, q13 + + push {r4, r5} + mov LOOP_COUNT, #3 +1: + vld1.16 {d0, d1, d2, d3}, [WORKSPACE, :128]! + veor.u16 q14, q14, q2 /* restore sign */ + vabs.s16 q12, q0 + vld1.16 {d20, d21, d22, d23}, [CORRECTION, :128]! + vabs.s16 q13, q1 + veor.u16 q15, q15, q3 + vld1.16 {d16, d17, d18, d19}, [RECIPROCAL, :128]! + vadd.u16 q12, q12, q10 /* add correction */ + vadd.u16 q13, q13, q11 + vmull.u16 q10, d24, d16 /* multiply by reciprocal */ + vmull.u16 q11, d25, d17 + vmull.u16 q8, d26, d18 + vmull.u16 q9, d27, d19 + vsub.u16 q14, q14, q2 + vld1.16 {d24, d25, d26, d27}, [SHIFT, :128]! + vsub.u16 q15, q15, q3 + vshrn.u32 d20, q10, #16 + vshrn.u32 d21, q11, #16 + vst1.16 {d28, d29, d30, d31}, [COEF_BLOCK, :128]! + vshrn.u32 d22, q8, #16 + vshrn.u32 d23, q9, #16 + vneg.s16 q12, q12 + vneg.s16 q13, q13 + vshr.s16 q2, q0, #15 /* extract sign */ + vshr.s16 q3, q1, #15 + vshl.u16 q14, q10, q12 /* shift */ + vshl.u16 q15, q11, q13 + subs LOOP_COUNT, LOOP_COUNT, #1 + bne 1b + pop {r4, r5} + + veor.u16 q14, q14, q2 /* restore sign */ + veor.u16 q15, q15, q3 + vsub.u16 q14, q14, q2 + vsub.u16 q15, q15, q3 + vst1.16 {d28, d29, d30, d31}, [COEF_BLOCK, :128]! + + bx lr /* return */ + + .unreq COEF_BLOCK + .unreq DIVISORS + .unreq WORKSPACE + .unreq RECIPROCAL + .unreq CORRECTION + .unreq SHIFT + .unreq LOOP_COUNT + + +/*****************************************************************************/ + +/* + * GLOBAL(void) + * jsimd_h2v1_fancy_upsample_neon (int max_v_samp_factor, + * JDIMENSION downsampled_width, + * JSAMPARRAY input_data, + * JSAMPARRAY *output_data_ptr); + * + * Note: the use of unaligned writes is the main remaining bottleneck in + * this code, which can be potentially solved to get up to tens + * of percents performance improvement on Cortex-A8/Cortex-A9. + */ + +/* + * Upsample 16 source pixels to 32 destination pixels. The new 16 source + * pixels are loaded to q0. The previous 16 source pixels are in q1. The + * shifted-by-one source pixels are constructed in q2 by using q0 and q1. + * Register d28 is used for multiplication by 3. Register q15 is used + * for adding +1 bias. + */ +.macro upsample16 OUTPTR, INPTR + vld1.8 {q0}, [\INPTR]! + vmovl.u8 q8, d0 + vext.8 q2, q1, q0, #15 + vmovl.u8 q9, d1 + vaddw.u8 q10, q15, d4 + vaddw.u8 q11, q15, d5 + vmlal.u8 q8, d4, d28 + vmlal.u8 q9, d5, d28 + vmlal.u8 q10, d0, d28 + vmlal.u8 q11, d1, d28 + vmov q1, q0 /* backup source pixels to q1 */ + vrshrn.u16 d6, q8, #2 + vrshrn.u16 d7, q9, #2 + vshrn.u16 d8, q10, #2 + vshrn.u16 d9, q11, #2 + vst2.8 {d6, d7, d8, d9}, [\OUTPTR]! +.endm + +/* + * Upsample 32 source pixels to 64 destination pixels. Compared to 'usample16' + * macro, the roles of q0 and q1 registers are reversed for even and odd + * groups of 16 pixels, that's why "vmov q1, q0" instructions are not needed. + * Also this unrolling allows to reorder loads and stores to compensate + * multiplication latency and reduce stalls. + */ +.macro upsample32 OUTPTR, INPTR + /* even 16 pixels group */ + vld1.8 {q0}, [\INPTR]! + vmovl.u8 q8, d0 + vext.8 q2, q1, q0, #15 + vmovl.u8 q9, d1 + vaddw.u8 q10, q15, d4 + vaddw.u8 q11, q15, d5 + vmlal.u8 q8, d4, d28 + vmlal.u8 q9, d5, d28 + vmlal.u8 q10, d0, d28 + vmlal.u8 q11, d1, d28 + /* odd 16 pixels group */ + vld1.8 {q1}, [\INPTR]! + vrshrn.u16 d6, q8, #2 + vrshrn.u16 d7, q9, #2 + vshrn.u16 d8, q10, #2 + vshrn.u16 d9, q11, #2 + vmovl.u8 q8, d2 + vext.8 q2, q0, q1, #15 + vmovl.u8 q9, d3 + vaddw.u8 q10, q15, d4 + vaddw.u8 q11, q15, d5 + vmlal.u8 q8, d4, d28 + vmlal.u8 q9, d5, d28 + vmlal.u8 q10, d2, d28 + vmlal.u8 q11, d3, d28 + vst2.8 {d6, d7, d8, d9}, [\OUTPTR]! + vrshrn.u16 d6, q8, #2 + vrshrn.u16 d7, q9, #2 + vshrn.u16 d8, q10, #2 + vshrn.u16 d9, q11, #2 + vst2.8 {d6, d7, d8, d9}, [\OUTPTR]! +.endm + +/* + * Upsample a row of WIDTH pixels from INPTR to OUTPTR. + */ +.macro upsample_row OUTPTR, INPTR, WIDTH, TMP1 + /* special case for the first and last pixels */ + sub \WIDTH, \WIDTH, #1 + add \OUTPTR, \OUTPTR, #1 + ldrb \TMP1, [\INPTR, \WIDTH] + strb \TMP1, [\OUTPTR, \WIDTH, asl #1] + ldrb \TMP1, [\INPTR], #1 + strb \TMP1, [\OUTPTR, #-1] + vmov.8 d3[7], \TMP1 + + subs \WIDTH, \WIDTH, #32 + blt 5f +0: /* process 32 pixels per iteration */ + upsample32 \OUTPTR, \INPTR + subs \WIDTH, \WIDTH, #32 + bge 0b +5: + adds \WIDTH, \WIDTH, #16 + blt 1f +0: /* process 16 pixels if needed */ + upsample16 \OUTPTR, \INPTR + subs \WIDTH, \WIDTH, #16 +1: + adds \WIDTH, \WIDTH, #16 + beq 9f + + /* load the remaining 1-15 pixels */ + add \INPTR, \INPTR, \WIDTH + tst \WIDTH, #1 + beq 2f + sub \INPTR, \INPTR, #1 + vld1.8 {d0[0]}, [\INPTR] +2: + tst \WIDTH, #2 + beq 2f + vext.8 d0, d0, d0, #6 + sub \INPTR, \INPTR, #1 + vld1.8 {d0[1]}, [\INPTR] + sub \INPTR, \INPTR, #1 + vld1.8 {d0[0]}, [\INPTR] +2: + tst \WIDTH, #4 + beq 2f + vrev64.32 d0, d0 + sub \INPTR, \INPTR, #1 + vld1.8 {d0[3]}, [\INPTR] + sub \INPTR, \INPTR, #1 + vld1.8 {d0[2]}, [\INPTR] + sub \INPTR, \INPTR, #1 + vld1.8 {d0[1]}, [\INPTR] + sub \INPTR, \INPTR, #1 + vld1.8 {d0[0]}, [\INPTR] +2: + tst \WIDTH, #8 + beq 2f + vmov d1, d0 + sub \INPTR, \INPTR, #8 + vld1.8 {d0}, [\INPTR] +2: /* upsample the remaining pixels */ + vmovl.u8 q8, d0 + vext.8 q2, q1, q0, #15 + vmovl.u8 q9, d1 + vaddw.u8 q10, q15, d4 + vaddw.u8 q11, q15, d5 + vmlal.u8 q8, d4, d28 + vmlal.u8 q9, d5, d28 + vmlal.u8 q10, d0, d28 + vmlal.u8 q11, d1, d28 + vrshrn.u16 d10, q8, #2 + vrshrn.u16 d12, q9, #2 + vshrn.u16 d11, q10, #2 + vshrn.u16 d13, q11, #2 + vzip.8 d10, d11 + vzip.8 d12, d13 + /* store the remaining pixels */ + tst \WIDTH, #8 + beq 2f + vst1.8 {d10, d11}, [\OUTPTR]! + vmov q5, q6 +2: + tst \WIDTH, #4 + beq 2f + vst1.8 {d10}, [\OUTPTR]! + vmov d10, d11 +2: + tst \WIDTH, #2 + beq 2f + vst1.8 {d10[0]}, [\OUTPTR]! + vst1.8 {d10[1]}, [\OUTPTR]! + vst1.8 {d10[2]}, [\OUTPTR]! + vst1.8 {d10[3]}, [\OUTPTR]! + vext.8 d10, d10, d10, #4 +2: + tst \WIDTH, #1 + beq 2f + vst1.8 {d10[0]}, [\OUTPTR]! + vst1.8 {d10[1]}, [\OUTPTR]! +2: +9: +.endm + +asm_function jsimd_h2v1_fancy_upsample_neon + + MAX_V_SAMP_FACTOR .req r0 + DOWNSAMPLED_WIDTH .req r1 + INPUT_DATA .req r2 + OUTPUT_DATA_PTR .req r3 + OUTPUT_DATA .req OUTPUT_DATA_PTR + + OUTPTR .req r4 + INPTR .req r5 + WIDTH .req ip + TMP .req lr + + push {r4, r5, r6, lr} + vpush {d8-d15} + + ldr OUTPUT_DATA, [OUTPUT_DATA_PTR] + cmp MAX_V_SAMP_FACTOR, #0 + ble 99f + + /* initialize constants */ + vmov.u8 d28, #3 + vmov.u16 q15, #1 +11: + ldr INPTR, [INPUT_DATA], #4 + ldr OUTPTR, [OUTPUT_DATA], #4 + mov WIDTH, DOWNSAMPLED_WIDTH + upsample_row OUTPTR, INPTR, WIDTH, TMP + subs MAX_V_SAMP_FACTOR, MAX_V_SAMP_FACTOR, #1 + bgt 11b + +99: + vpop {d8-d15} + pop {r4, r5, r6, pc} + + .unreq MAX_V_SAMP_FACTOR + .unreq DOWNSAMPLED_WIDTH + .unreq INPUT_DATA + .unreq OUTPUT_DATA_PTR + .unreq OUTPUT_DATA + + .unreq OUTPTR + .unreq INPTR + .unreq WIDTH + .unreq TMP + +.purgem upsample16 +.purgem upsample32 +.purgem upsample_row + + +/*****************************************************************************/ + +/* + * GLOBAL(JOCTET*) + * jsimd_huff_encode_one_block (working_state *state, JOCTET *buffer, + * JCOEFPTR block, int last_dc_val, + * c_derived_tbl *dctbl, c_derived_tbl *actbl) + * + */ + +.macro emit_byte BUFFER, PUT_BUFFER, PUT_BITS, ZERO, TMP + sub \PUT_BITS, \PUT_BITS, #0x8 + lsr \TMP, \PUT_BUFFER, \PUT_BITS + uxtb \TMP, \TMP + strb \TMP, [\BUFFER, #1]! + cmp \TMP, #0xff + /*it eq*/ + strbeq \ZERO, [\BUFFER, #1]! +.endm + +.macro put_bits PUT_BUFFER, PUT_BITS, CODE, SIZE + /*lsl \PUT_BUFFER, \PUT_BUFFER, \SIZE*/ + add \PUT_BITS, \SIZE + /*orr \PUT_BUFFER, \PUT_BUFFER, \CODE*/ + orr \PUT_BUFFER, \CODE, \PUT_BUFFER, lsl \SIZE +.endm + +.macro checkbuf15 BUFFER, PUT_BUFFER, PUT_BITS, ZERO, TMP + cmp \PUT_BITS, #0x10 + blt 15f + eor \ZERO, \ZERO, \ZERO + emit_byte \BUFFER, \PUT_BUFFER, \PUT_BITS, \ZERO, \TMP + emit_byte \BUFFER, \PUT_BUFFER, \PUT_BITS, \ZERO, \TMP +15: +.endm + +.balign 16 +jsimd_huff_encode_one_block_neon_consts: + .byte 0x01 + .byte 0x02 + .byte 0x04 + .byte 0x08 + .byte 0x10 + .byte 0x20 + .byte 0x40 + .byte 0x80 + +asm_function jsimd_huff_encode_one_block_neon + push {r4, r5, r6, r7, r8, r9, r10, r11, lr} + add r7, sp, #0x1c + sub r4, sp, #0x40 + bfc r4, #0, #5 + mov sp, r4 /* align sp on 32 bytes */ + vst1.64 {d8, d9, d10, d11}, [r4, :128]! + vst1.64 {d12, d13, d14, d15}, [r4, :128] + sub sp, #0x140 /* reserve 320 bytes */ + str r0, [sp, #0x18] /* working state > sp + Ox18 */ + add r4, sp, #0x20 /* r4 = t1 */ + ldr lr, [r7, #0x8] /* lr = dctbl */ + sub r10, r1, #0x1 /* r10=buffer-- */ + ldrsh r1, [r2] + mov r9, #0x10 + mov r8, #0x1 + adr r5, jsimd_huff_encode_one_block_neon_consts + /* prepare data */ + vld1.8 {d26}, [r5, :64] + veor q8, q8, q8 + veor q9, q9, q9 + vdup.16 q14, r9 + vdup.16 q15, r8 + veor q10, q10, q10 + veor q11, q11, q11 + sub r1, r1, r3 + add r9, r2, #0x22 + add r8, r2, #0x18 + add r3, r2, #0x36 + vmov.16 d0[0], r1 + vld1.16 {d2[0]}, [r9, :16] + vld1.16 {d4[0]}, [r8, :16] + vld1.16 {d6[0]}, [r3, :16] + add r1, r2, #0x2 + add r9, r2, #0x30 + add r8, r2, #0x26 + add r3, r2, #0x28 + vld1.16 {d0[1]}, [r1, :16] + vld1.16 {d2[1]}, [r9, :16] + vld1.16 {d4[1]}, [r8, :16] + vld1.16 {d6[1]}, [r3, :16] + add r1, r2, #0x10 + add r9, r2, #0x40 + add r8, r2, #0x34 + add r3, r2, #0x1a + vld1.16 {d0[2]}, [r1, :16] + vld1.16 {d2[2]}, [r9, :16] + vld1.16 {d4[2]}, [r8, :16] + vld1.16 {d6[2]}, [r3, :16] + add r1, r2, #0x20 + add r9, r2, #0x32 + add r8, r2, #0x42 + add r3, r2, #0xc + vld1.16 {d0[3]}, [r1, :16] + vld1.16 {d2[3]}, [r9, :16] + vld1.16 {d4[3]}, [r8, :16] + vld1.16 {d6[3]}, [r3, :16] + add r1, r2, #0x12 + add r9, r2, #0x24 + add r8, r2, #0x50 + add r3, r2, #0xe + vld1.16 {d1[0]}, [r1, :16] + vld1.16 {d3[0]}, [r9, :16] + vld1.16 {d5[0]}, [r8, :16] + vld1.16 {d7[0]}, [r3, :16] + add r1, r2, #0x4 + add r9, r2, #0x16 + add r8, r2, #0x60 + add r3, r2, #0x1c + vld1.16 {d1[1]}, [r1, :16] + vld1.16 {d3[1]}, [r9, :16] + vld1.16 {d5[1]}, [r8, :16] + vld1.16 {d7[1]}, [r3, :16] + add r1, r2, #0x6 + add r9, r2, #0x8 + add r8, r2, #0x52 + add r3, r2, #0x2a + vld1.16 {d1[2]}, [r1, :16] + vld1.16 {d3[2]}, [r9, :16] + vld1.16 {d5[2]}, [r8, :16] + vld1.16 {d7[2]}, [r3, :16] + add r1, r2, #0x14 + add r9, r2, #0xa + add r8, r2, #0x44 + add r3, r2, #0x38 + vld1.16 {d1[3]}, [r1, :16] + vld1.16 {d3[3]}, [r9, :16] + vld1.16 {d5[3]}, [r8, :16] + vld1.16 {d7[3]}, [r3, :16] + vcgt.s16 q8, q8, q0 + vcgt.s16 q9, q9, q1 + vcgt.s16 q10, q10, q2 + vcgt.s16 q11, q11, q3 + vabs.s16 q0, q0 + vabs.s16 q1, q1 + vabs.s16 q2, q2 + vabs.s16 q3, q3 + veor q8, q8, q0 + veor q9, q9, q1 + veor q10, q10, q2 + veor q11, q11, q3 + add r9, r4, #0x20 + add r8, r4, #0x80 + add r3, r4, #0xa0 + vclz.i16 q0, q0 + vclz.i16 q1, q1 + vclz.i16 q2, q2 + vclz.i16 q3, q3 + vsub.i16 q0, q14, q0 + vsub.i16 q1, q14, q1 + vsub.i16 q2, q14, q2 + vsub.i16 q3, q14, q3 + vst1.16 {d0, d1, d2, d3}, [r4, :256] + vst1.16 {d4, d5, d6, d7}, [r9, :256] + vshl.s16 q0, q15, q0 + vshl.s16 q1, q15, q1 + vshl.s16 q2, q15, q2 + vshl.s16 q3, q15, q3 + vsub.i16 q0, q0, q15 + vsub.i16 q1, q1, q15 + vsub.i16 q2, q2, q15 + vsub.i16 q3, q3, q15 + vand q8, q8, q0 + vand q9, q9, q1 + vand q10, q10, q2 + vand q11, q11, q3 + vst1.16 {d16, d17, d18, d19}, [r8, :256] + vst1.16 {d20, d21, d22, d23}, [r3, :256] + add r1, r2, #0x46 + add r9, r2, #0x3a + add r8, r2, #0x74 + add r3, r2, #0x6a + vld1.16 {d8[0]}, [r1, :16] + vld1.16 {d10[0]}, [r9, :16] + vld1.16 {d12[0]}, [r8, :16] + vld1.16 {d14[0]}, [r3, :16] + veor q8, q8, q8 + veor q9, q9, q9 + veor q10, q10, q10 + veor q11, q11, q11 + add r1, r2, #0x54 + add r9, r2, #0x2c + add r8, r2, #0x76 + add r3, r2, #0x78 + vld1.16 {d8[1]}, [r1, :16] + vld1.16 {d10[1]}, [r9, :16] + vld1.16 {d12[1]}, [r8, :16] + vld1.16 {d14[1]}, [r3, :16] + add r1, r2, #0x62 + add r9, r2, #0x1e + add r8, r2, #0x68 + add r3, r2, #0x7a + vld1.16 {d8[2]}, [r1, :16] + vld1.16 {d10[2]}, [r9, :16] + vld1.16 {d12[2]}, [r8, :16] + vld1.16 {d14[2]}, [r3, :16] + add r1, r2, #0x70 + add r9, r2, #0x2e + add r8, r2, #0x5a + add r3, r2, #0x6c + vld1.16 {d8[3]}, [r1, :16] + vld1.16 {d10[3]}, [r9, :16] + vld1.16 {d12[3]}, [r8, :16] + vld1.16 {d14[3]}, [r3, :16] + add r1, r2, #0x72 + add r9, r2, #0x3c + add r8, r2, #0x4c + add r3, r2, #0x5e + vld1.16 {d9[0]}, [r1, :16] + vld1.16 {d11[0]}, [r9, :16] + vld1.16 {d13[0]}, [r8, :16] + vld1.16 {d15[0]}, [r3, :16] + add r1, r2, #0x64 + add r9, r2, #0x4a + add r8, r2, #0x3e + add r3, r2, #0x6e + vld1.16 {d9[1]}, [r1, :16] + vld1.16 {d11[1]}, [r9, :16] + vld1.16 {d13[1]}, [r8, :16] + vld1.16 {d15[1]}, [r3, :16] + add r1, r2, #0x56 + add r9, r2, #0x58 + add r8, r2, #0x4e + add r3, r2, #0x7c + vld1.16 {d9[2]}, [r1, :16] + vld1.16 {d11[2]}, [r9, :16] + vld1.16 {d13[2]}, [r8, :16] + vld1.16 {d15[2]}, [r3, :16] + add r1, r2, #0x48 + add r9, r2, #0x66 + add r8, r2, #0x5c + add r3, r2, #0x7e + vld1.16 {d9[3]}, [r1, :16] + vld1.16 {d11[3]}, [r9, :16] + vld1.16 {d13[3]}, [r8, :16] + vld1.16 {d15[3]}, [r3, :16] + vcgt.s16 q8, q8, q4 + vcgt.s16 q9, q9, q5 + vcgt.s16 q10, q10, q6 + vcgt.s16 q11, q11, q7 + vabs.s16 q4, q4 + vabs.s16 q5, q5 + vabs.s16 q6, q6 + vabs.s16 q7, q7 + veor q8, q8, q4 + veor q9, q9, q5 + veor q10, q10, q6 + veor q11, q11, q7 + add r1, r4, #0x40 + add r9, r4, #0x60 + add r8, r4, #0xc0 + add r3, r4, #0xe0 + vclz.i16 q4, q4 + vclz.i16 q5, q5 + vclz.i16 q6, q6 + vclz.i16 q7, q7 + vsub.i16 q4, q14, q4 + vsub.i16 q5, q14, q5 + vsub.i16 q6, q14, q6 + vsub.i16 q7, q14, q7 + vst1.16 {d8, d9, d10, d11}, [r1, :256] + vst1.16 {d12, d13, d14, d15}, [r9, :256] + vshl.s16 q4, q15, q4 + vshl.s16 q5, q15, q5 + vshl.s16 q6, q15, q6 + vshl.s16 q7, q15, q7 + vsub.i16 q4, q4, q15 + vsub.i16 q5, q5, q15 + vsub.i16 q6, q6, q15 + vsub.i16 q7, q7, q15 + vand q8, q8, q4 + vand q9, q9, q5 + vand q10, q10, q6 + vand q11, q11, q7 + vst1.16 {d16, d17, d18, d19}, [r8, :256] + vst1.16 {d20, d21, d22, d23}, [r3, :256] + ldr r12, [r7, #0xc] /* r12 = actbl */ + add r1, lr, #0x400 /* r1 = dctbl->ehufsi */ + mov r9, r12 /* r9 = actbl */ + add r6, r4, #0x80 /* r6 = t2 */ + ldr r11, [r0, #0x8] /* r11 = put_buffer */ + ldr r4, [r0, #0xc] /* r4 = put_bits */ + ldrh r2, [r6, #-128] /* r2 = nbits */ + ldrh r3, [r6] /* r3 = temp2 & (((JLONG) 1)<<nbits) - 1; */ + ldr r0, [lr, r2, lsl #2] + ldrb r5, [r1, r2] + put_bits r11, r4, r0, r5 + checkbuf15 r10, r11, r4, r5, r0 + put_bits r11, r4, r3, r2 + checkbuf15 r10, r11, r4, r5, r0 + mov lr, r6 /* lr = t2 */ + add r5, r9, #0x400 /* r5 = actbl->ehufsi */ + ldrsb r6, [r5, #0xf0] /* r6 = actbl->ehufsi[0xf0] */ + veor q8, q8, q8 + vceq.i16 q0, q0, q8 + vceq.i16 q1, q1, q8 + vceq.i16 q2, q2, q8 + vceq.i16 q3, q3, q8 + vceq.i16 q4, q4, q8 + vceq.i16 q5, q5, q8 + vceq.i16 q6, q6, q8 + vceq.i16 q7, q7, q8 + vmovn.i16 d0, q0 + vmovn.i16 d2, q1 + vmovn.i16 d4, q2 + vmovn.i16 d6, q3 + vmovn.i16 d8, q4 + vmovn.i16 d10, q5 + vmovn.i16 d12, q6 + vmovn.i16 d14, q7 + vand d0, d0, d26 + vand d2, d2, d26 + vand d4, d4, d26 + vand d6, d6, d26 + vand d8, d8, d26 + vand d10, d10, d26 + vand d12, d12, d26 + vand d14, d14, d26 + vpadd.i8 d0, d0, d2 + vpadd.i8 d4, d4, d6 + vpadd.i8 d8, d8, d10 + vpadd.i8 d12, d12, d14 + vpadd.i8 d0, d0, d4 + vpadd.i8 d8, d8, d12 + vpadd.i8 d0, d0, d8 + vmov.32 r1, d0[1] + vmov.32 r8, d0[0] + mvn r1, r1 + mvn r8, r8 + lsrs r1, r1, #0x1 + rrx r8, r8 /* shift in last r1 bit while shifting out DC bit */ + rbit r1, r1 /* r1 = index1 */ + rbit r8, r8 /* r8 = index0 */ + ldr r0, [r9, #0x3c0] /* r0 = actbl->ehufco[0xf0] */ + str r1, [sp, #0x14] /* index1 > sp + 0x14 */ + cmp r8, #0x0 + beq 6f +1: + clz r2, r8 + add lr, lr, r2, lsl #1 + lsl r8, r8, r2 + ldrh r1, [lr, #-126] +2: + cmp r2, #0x10 + blt 3f + sub r2, r2, #0x10 + put_bits r11, r4, r0, r6 + cmp r4, #0x10 + blt 2b + eor r3, r3, r3 + emit_byte r10, r11, r4, r3, r12 + emit_byte r10, r11, r4, r3, r12 + b 2b +3: + add r2, r1, r2, lsl #4 + ldrh r3, [lr, #2]! + ldr r12, [r9, r2, lsl #2] + ldrb r2, [r5, r2] + put_bits r11, r4, r12, r2 + checkbuf15 r10, r11, r4, r2, r12 + put_bits r11, r4, r3, r1 + checkbuf15 r10, r11, r4, r2, r12 + lsls r8, r8, #0x1 + bne 1b +6: + add r12, sp, #0x20 /* r12 = t1 */ + ldr r8, [sp, #0x14] /* r8 = index1 */ + adds r12, #0xc0 /* r12 = t2 + (DCTSIZE2/2) */ + cmp r8, #0x0 + beq 6f + clz r2, r8 + sub r12, r12, lr + lsl r8, r8, r2 + add r2, r2, r12, lsr #1 + add lr, lr, r2, lsl #1 + b 7f +1: + clz r2, r8 + add lr, lr, r2, lsl #1 + lsl r8, r8, r2 +7: + ldrh r1, [lr, #-126] +2: + cmp r2, #0x10 + blt 3f + sub r2, r2, #0x10 + put_bits r11, r4, r0, r6 + cmp r4, #0x10 + blt 2b + eor r3, r3, r3 + emit_byte r10, r11, r4, r3, r12 + emit_byte r10, r11, r4, r3, r12 + b 2b +3: + add r2, r1, r2, lsl #4 + ldrh r3, [lr, #2]! + ldr r12, [r9, r2, lsl #2] + ldrb r2, [r5, r2] + put_bits r11, r4, r12, r2 + checkbuf15 r10, r11, r4, r2, r12 + put_bits r11, r4, r3, r1 + checkbuf15 r10, r11, r4, r2, r12 + lsls r8, r8, #0x1 + bne 1b +6: + add r0, sp, #0x20 + add r0, #0xfe + cmp lr, r0 + bhs 1f + ldr r1, [r9] + ldrb r0, [r5] + put_bits r11, r4, r1, r0 + checkbuf15 r10, r11, r4, r0, r1 +1: + ldr r12, [sp, #0x18] + str r11, [r12, #0x8] + str r4, [r12, #0xc] + add r0, r10, #0x1 + add r4, sp, #0x140 + vld1.64 {d8, d9, d10, d11}, [r4, :128]! + vld1.64 {d12, d13, d14, d15}, [r4, :128] + sub r4, r7, #0x1c + mov sp, r4 + pop {r4, r5, r6, r7, r8, r9, r10, r11, pc} + +.purgem emit_byte +.purgem put_bits +.purgem checkbuf15 diff --git a/media/libjpeg/simd/jsimd_i386.c b/media/libjpeg/simd/jsimd_i386.c new file mode 100644 index 000000000..6da8bd891 --- /dev/null +++ b/media/libjpeg/simd/jsimd_i386.c @@ -0,0 +1,1091 @@ +/* + * jsimd_i386.c + * + * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB + * Copyright (C) 2009-2011, 2013-2014, 2016, D. R. Commander. + * Copyright (C) 2015, Matthieu Darbois. + * + * Based on the x86 SIMD extension for IJG JPEG library, + * Copyright (C) 1999-2006, MIYASAKA Masaru. + * For conditions of distribution and use, see copyright notice in jsimdext.inc + * + * This file contains the interface between the "normal" portions + * of the library and the SIMD implementations when running on a + * 32-bit x86 architecture. + */ + +#define JPEG_INTERNALS +#include "../jinclude.h" +#include "../jpeglib.h" +#include "../jsimd.h" +#include "../jdct.h" +#include "../jsimddct.h" +#include "jsimd.h" + +/* + * In the PIC cases, we have no guarantee that constants will keep + * their alignment. This macro allows us to verify it at runtime. + */ +#define IS_ALIGNED(ptr, order) (((unsigned)ptr & ((1 << order) - 1)) == 0) + +#define IS_ALIGNED_SSE(ptr) (IS_ALIGNED(ptr, 4)) /* 16 byte alignment */ + +static unsigned int simd_support = ~0; +static unsigned int simd_huffman = 1; + +/* + * Check what SIMD accelerations are supported. + * + * FIXME: This code is racy under a multi-threaded environment. + */ +LOCAL(void) +init_simd (void) +{ + char *env = NULL; + + if (simd_support != ~0U) + return; + + simd_support = jpeg_simd_cpu_support(); + + /* Force different settings through environment variables */ + env = getenv("JSIMD_FORCEMMX"); + if ((env != NULL) && (strcmp(env, "1") == 0)) + simd_support &= JSIMD_MMX; + env = getenv("JSIMD_FORCE3DNOW"); + if ((env != NULL) && (strcmp(env, "1") == 0)) + simd_support &= JSIMD_3DNOW|JSIMD_MMX; + env = getenv("JSIMD_FORCESSE"); + if ((env != NULL) && (strcmp(env, "1") == 0)) + simd_support &= JSIMD_SSE|JSIMD_MMX; + env = getenv("JSIMD_FORCESSE2"); + if ((env != NULL) && (strcmp(env, "1") == 0)) + simd_support &= JSIMD_SSE2; + env = getenv("JSIMD_FORCENONE"); + if ((env != NULL) && (strcmp(env, "1") == 0)) + simd_support = 0; + env = getenv("JSIMD_NOHUFFENC"); + if ((env != NULL) && (strcmp(env, "1") == 0)) + simd_huffman = 0; +} + +GLOBAL(int) +jsimd_can_rgb_ycc (void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4)) + return 0; + + if ((simd_support & JSIMD_SSE2) && + IS_ALIGNED_SSE(jconst_rgb_ycc_convert_sse2)) + return 1; + if (simd_support & JSIMD_MMX) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_rgb_gray (void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4)) + return 0; + + if ((simd_support & JSIMD_SSE2) && + IS_ALIGNED_SSE(jconst_rgb_gray_convert_sse2)) + return 1; + if (simd_support & JSIMD_MMX) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_ycc_rgb (void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4)) + return 0; + + if ((simd_support & JSIMD_SSE2) && + IS_ALIGNED_SSE(jconst_ycc_rgb_convert_sse2)) + return 1; + if (simd_support & JSIMD_MMX) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_ycc_rgb565 (void) +{ + return 0; +} + +GLOBAL(void) +jsimd_rgb_ycc_convert (j_compress_ptr cinfo, + JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows) +{ + void (*sse2fct)(JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int); + void (*mmxfct)(JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int); + + switch(cinfo->in_color_space) { + case JCS_EXT_RGB: + sse2fct=jsimd_extrgb_ycc_convert_sse2; + mmxfct=jsimd_extrgb_ycc_convert_mmx; + break; + case JCS_EXT_RGBX: + case JCS_EXT_RGBA: + sse2fct=jsimd_extrgbx_ycc_convert_sse2; + mmxfct=jsimd_extrgbx_ycc_convert_mmx; + break; + case JCS_EXT_BGR: + sse2fct=jsimd_extbgr_ycc_convert_sse2; + mmxfct=jsimd_extbgr_ycc_convert_mmx; + break; + case JCS_EXT_BGRX: + case JCS_EXT_BGRA: + sse2fct=jsimd_extbgrx_ycc_convert_sse2; + mmxfct=jsimd_extbgrx_ycc_convert_mmx; + break; + case JCS_EXT_XBGR: + case JCS_EXT_ABGR: + sse2fct=jsimd_extxbgr_ycc_convert_sse2; + mmxfct=jsimd_extxbgr_ycc_convert_mmx; + break; + case JCS_EXT_XRGB: + case JCS_EXT_ARGB: + sse2fct=jsimd_extxrgb_ycc_convert_sse2; + mmxfct=jsimd_extxrgb_ycc_convert_mmx; + break; + default: + sse2fct=jsimd_rgb_ycc_convert_sse2; + mmxfct=jsimd_rgb_ycc_convert_mmx; + break; + } + + if ((simd_support & JSIMD_SSE2) && + IS_ALIGNED_SSE(jconst_rgb_ycc_convert_sse2)) + sse2fct(cinfo->image_width, input_buf, output_buf, output_row, num_rows); + else if (simd_support & JSIMD_MMX) + mmxfct(cinfo->image_width, input_buf, output_buf, output_row, num_rows); +} + +GLOBAL(void) +jsimd_rgb_gray_convert (j_compress_ptr cinfo, + JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows) +{ + void (*sse2fct)(JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int); + void (*mmxfct)(JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int); + + switch(cinfo->in_color_space) { + case JCS_EXT_RGB: + sse2fct=jsimd_extrgb_gray_convert_sse2; + mmxfct=jsimd_extrgb_gray_convert_mmx; + break; + case JCS_EXT_RGBX: + case JCS_EXT_RGBA: + sse2fct=jsimd_extrgbx_gray_convert_sse2; + mmxfct=jsimd_extrgbx_gray_convert_mmx; + break; + case JCS_EXT_BGR: + sse2fct=jsimd_extbgr_gray_convert_sse2; + mmxfct=jsimd_extbgr_gray_convert_mmx; + break; + case JCS_EXT_BGRX: + case JCS_EXT_BGRA: + sse2fct=jsimd_extbgrx_gray_convert_sse2; + mmxfct=jsimd_extbgrx_gray_convert_mmx; + break; + case JCS_EXT_XBGR: + case JCS_EXT_ABGR: + sse2fct=jsimd_extxbgr_gray_convert_sse2; + mmxfct=jsimd_extxbgr_gray_convert_mmx; + break; + case JCS_EXT_XRGB: + case JCS_EXT_ARGB: + sse2fct=jsimd_extxrgb_gray_convert_sse2; + mmxfct=jsimd_extxrgb_gray_convert_mmx; + break; + default: + sse2fct=jsimd_rgb_gray_convert_sse2; + mmxfct=jsimd_rgb_gray_convert_mmx; + break; + } + + if ((simd_support & JSIMD_SSE2) && + IS_ALIGNED_SSE(jconst_rgb_gray_convert_sse2)) + sse2fct(cinfo->image_width, input_buf, output_buf, output_row, num_rows); + else if (simd_support & JSIMD_MMX) + mmxfct(cinfo->image_width, input_buf, output_buf, output_row, num_rows); +} + +GLOBAL(void) +jsimd_ycc_rgb_convert (j_decompress_ptr cinfo, + JSAMPIMAGE input_buf, JDIMENSION input_row, + JSAMPARRAY output_buf, int num_rows) +{ + void (*sse2fct)(JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY, int); + void (*mmxfct)(JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY, int); + + switch(cinfo->out_color_space) { + case JCS_EXT_RGB: + sse2fct=jsimd_ycc_extrgb_convert_sse2; + mmxfct=jsimd_ycc_extrgb_convert_mmx; + break; + case JCS_EXT_RGBX: + case JCS_EXT_RGBA: + sse2fct=jsimd_ycc_extrgbx_convert_sse2; + mmxfct=jsimd_ycc_extrgbx_convert_mmx; + break; + case JCS_EXT_BGR: + sse2fct=jsimd_ycc_extbgr_convert_sse2; + mmxfct=jsimd_ycc_extbgr_convert_mmx; + break; + case JCS_EXT_BGRX: + case JCS_EXT_BGRA: + sse2fct=jsimd_ycc_extbgrx_convert_sse2; + mmxfct=jsimd_ycc_extbgrx_convert_mmx; + break; + case JCS_EXT_XBGR: + case JCS_EXT_ABGR: + sse2fct=jsimd_ycc_extxbgr_convert_sse2; + mmxfct=jsimd_ycc_extxbgr_convert_mmx; + break; + case JCS_EXT_XRGB: + case JCS_EXT_ARGB: + sse2fct=jsimd_ycc_extxrgb_convert_sse2; + mmxfct=jsimd_ycc_extxrgb_convert_mmx; + break; + default: + sse2fct=jsimd_ycc_rgb_convert_sse2; + mmxfct=jsimd_ycc_rgb_convert_mmx; + break; + } + + if ((simd_support & JSIMD_SSE2) && + IS_ALIGNED_SSE(jconst_ycc_rgb_convert_sse2)) + sse2fct(cinfo->output_width, input_buf, input_row, output_buf, num_rows); + else if (simd_support & JSIMD_MMX) + mmxfct(cinfo->output_width, input_buf, input_row, output_buf, num_rows); +} + +GLOBAL(void) +jsimd_ycc_rgb565_convert (j_decompress_ptr cinfo, + JSAMPIMAGE input_buf, JDIMENSION input_row, + JSAMPARRAY output_buf, int num_rows) +{ +} + +GLOBAL(int) +jsimd_can_h2v2_downsample (void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + + if (simd_support & JSIMD_SSE2) + return 1; + if (simd_support & JSIMD_MMX) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_h2v1_downsample (void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + + if (simd_support & JSIMD_SSE2) + return 1; + if (simd_support & JSIMD_MMX) + return 1; + + return 0; +} + +GLOBAL(void) +jsimd_h2v2_downsample (j_compress_ptr cinfo, jpeg_component_info *compptr, + JSAMPARRAY input_data, JSAMPARRAY output_data) +{ + if (simd_support & JSIMD_SSE2) + jsimd_h2v2_downsample_sse2(cinfo->image_width, cinfo->max_v_samp_factor, + compptr->v_samp_factor, + compptr->width_in_blocks, input_data, + output_data); + else if (simd_support & JSIMD_MMX) + jsimd_h2v2_downsample_mmx(cinfo->image_width, cinfo->max_v_samp_factor, + compptr->v_samp_factor, compptr->width_in_blocks, + input_data, output_data); +} + +GLOBAL(void) +jsimd_h2v1_downsample (j_compress_ptr cinfo, jpeg_component_info *compptr, + JSAMPARRAY input_data, JSAMPARRAY output_data) +{ + if (simd_support & JSIMD_SSE2) + jsimd_h2v1_downsample_sse2(cinfo->image_width, cinfo->max_v_samp_factor, + compptr->v_samp_factor, + compptr->width_in_blocks, input_data, + output_data); + else if (simd_support & JSIMD_MMX) + jsimd_h2v1_downsample_mmx(cinfo->image_width, cinfo->max_v_samp_factor, + compptr->v_samp_factor, compptr->width_in_blocks, + input_data, output_data); +} + +GLOBAL(int) +jsimd_can_h2v2_upsample (void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + + if (simd_support & JSIMD_SSE2) + return 1; + if (simd_support & JSIMD_MMX) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_h2v1_upsample (void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + + if (simd_support & JSIMD_SSE2) + return 1; + if (simd_support & JSIMD_MMX) + return 1; + + return 0; +} + +GLOBAL(void) +jsimd_h2v2_upsample (j_decompress_ptr cinfo, + jpeg_component_info *compptr, + JSAMPARRAY input_data, + JSAMPARRAY *output_data_ptr) +{ + if (simd_support & JSIMD_SSE2) + jsimd_h2v2_upsample_sse2(cinfo->max_v_samp_factor, cinfo->output_width, + input_data, output_data_ptr); + else if (simd_support & JSIMD_MMX) + jsimd_h2v2_upsample_mmx(cinfo->max_v_samp_factor, cinfo->output_width, + input_data, output_data_ptr); +} + +GLOBAL(void) +jsimd_h2v1_upsample (j_decompress_ptr cinfo, + jpeg_component_info *compptr, + JSAMPARRAY input_data, + JSAMPARRAY *output_data_ptr) +{ + if (simd_support & JSIMD_SSE2) + jsimd_h2v1_upsample_sse2(cinfo->max_v_samp_factor, cinfo->output_width, + input_data, output_data_ptr); + else if (simd_support & JSIMD_MMX) + jsimd_h2v1_upsample_mmx(cinfo->max_v_samp_factor, cinfo->output_width, + input_data, output_data_ptr); +} + +GLOBAL(int) +jsimd_can_h2v2_fancy_upsample (void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + + if ((simd_support & JSIMD_SSE2) && + IS_ALIGNED_SSE(jconst_fancy_upsample_sse2)) + return 1; + if (simd_support & JSIMD_MMX) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_h2v1_fancy_upsample (void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + + if ((simd_support & JSIMD_SSE2) && + IS_ALIGNED_SSE(jconst_fancy_upsample_sse2)) + return 1; + if (simd_support & JSIMD_MMX) + return 1; + + return 0; +} + +GLOBAL(void) +jsimd_h2v2_fancy_upsample (j_decompress_ptr cinfo, + jpeg_component_info *compptr, + JSAMPARRAY input_data, + JSAMPARRAY *output_data_ptr) +{ + if ((simd_support & JSIMD_SSE2) && + IS_ALIGNED_SSE(jconst_fancy_upsample_sse2)) + jsimd_h2v2_fancy_upsample_sse2(cinfo->max_v_samp_factor, + compptr->downsampled_width, input_data, + output_data_ptr); + else if (simd_support & JSIMD_MMX) + jsimd_h2v2_fancy_upsample_mmx(cinfo->max_v_samp_factor, + compptr->downsampled_width, input_data, + output_data_ptr); +} + +GLOBAL(void) +jsimd_h2v1_fancy_upsample (j_decompress_ptr cinfo, + jpeg_component_info *compptr, + JSAMPARRAY input_data, + JSAMPARRAY *output_data_ptr) +{ + if ((simd_support & JSIMD_SSE2) && + IS_ALIGNED_SSE(jconst_fancy_upsample_sse2)) + jsimd_h2v1_fancy_upsample_sse2(cinfo->max_v_samp_factor, + compptr->downsampled_width, input_data, + output_data_ptr); + else if (simd_support & JSIMD_MMX) + jsimd_h2v1_fancy_upsample_mmx(cinfo->max_v_samp_factor, + compptr->downsampled_width, input_data, + output_data_ptr); +} + +GLOBAL(int) +jsimd_can_h2v2_merged_upsample (void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + + if ((simd_support & JSIMD_SSE2) && + IS_ALIGNED_SSE(jconst_merged_upsample_sse2)) + return 1; + if (simd_support & JSIMD_MMX) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_h2v1_merged_upsample (void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + + if ((simd_support & JSIMD_SSE2) && + IS_ALIGNED_SSE(jconst_merged_upsample_sse2)) + return 1; + if (simd_support & JSIMD_MMX) + return 1; + + return 0; +} + +GLOBAL(void) +jsimd_h2v2_merged_upsample (j_decompress_ptr cinfo, + JSAMPIMAGE input_buf, + JDIMENSION in_row_group_ctr, + JSAMPARRAY output_buf) +{ + void (*sse2fct)(JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY); + void (*mmxfct)(JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY); + + switch(cinfo->out_color_space) { + case JCS_EXT_RGB: + sse2fct=jsimd_h2v2_extrgb_merged_upsample_sse2; + mmxfct=jsimd_h2v2_extrgb_merged_upsample_mmx; + break; + case JCS_EXT_RGBX: + case JCS_EXT_RGBA: + sse2fct=jsimd_h2v2_extrgbx_merged_upsample_sse2; + mmxfct=jsimd_h2v2_extrgbx_merged_upsample_mmx; + break; + case JCS_EXT_BGR: + sse2fct=jsimd_h2v2_extbgr_merged_upsample_sse2; + mmxfct=jsimd_h2v2_extbgr_merged_upsample_mmx; + break; + case JCS_EXT_BGRX: + case JCS_EXT_BGRA: + sse2fct=jsimd_h2v2_extbgrx_merged_upsample_sse2; + mmxfct=jsimd_h2v2_extbgrx_merged_upsample_mmx; + break; + case JCS_EXT_XBGR: + case JCS_EXT_ABGR: + sse2fct=jsimd_h2v2_extxbgr_merged_upsample_sse2; + mmxfct=jsimd_h2v2_extxbgr_merged_upsample_mmx; + break; + case JCS_EXT_XRGB: + case JCS_EXT_ARGB: + sse2fct=jsimd_h2v2_extxrgb_merged_upsample_sse2; + mmxfct=jsimd_h2v2_extxrgb_merged_upsample_mmx; + break; + default: + sse2fct=jsimd_h2v2_merged_upsample_sse2; + mmxfct=jsimd_h2v2_merged_upsample_mmx; + break; + } + + if ((simd_support & JSIMD_SSE2) && + IS_ALIGNED_SSE(jconst_merged_upsample_sse2)) + sse2fct(cinfo->output_width, input_buf, in_row_group_ctr, output_buf); + else if (simd_support & JSIMD_MMX) + mmxfct(cinfo->output_width, input_buf, in_row_group_ctr, output_buf); +} + +GLOBAL(void) +jsimd_h2v1_merged_upsample (j_decompress_ptr cinfo, + JSAMPIMAGE input_buf, + JDIMENSION in_row_group_ctr, + JSAMPARRAY output_buf) +{ + void (*sse2fct)(JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY); + void (*mmxfct)(JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY); + + switch(cinfo->out_color_space) { + case JCS_EXT_RGB: + sse2fct=jsimd_h2v1_extrgb_merged_upsample_sse2; + mmxfct=jsimd_h2v1_extrgb_merged_upsample_mmx; + break; + case JCS_EXT_RGBX: + case JCS_EXT_RGBA: + sse2fct=jsimd_h2v1_extrgbx_merged_upsample_sse2; + mmxfct=jsimd_h2v1_extrgbx_merged_upsample_mmx; + break; + case JCS_EXT_BGR: + sse2fct=jsimd_h2v1_extbgr_merged_upsample_sse2; + mmxfct=jsimd_h2v1_extbgr_merged_upsample_mmx; + break; + case JCS_EXT_BGRX: + case JCS_EXT_BGRA: + sse2fct=jsimd_h2v1_extbgrx_merged_upsample_sse2; + mmxfct=jsimd_h2v1_extbgrx_merged_upsample_mmx; + break; + case JCS_EXT_XBGR: + case JCS_EXT_ABGR: + sse2fct=jsimd_h2v1_extxbgr_merged_upsample_sse2; + mmxfct=jsimd_h2v1_extxbgr_merged_upsample_mmx; + break; + case JCS_EXT_XRGB: + case JCS_EXT_ARGB: + sse2fct=jsimd_h2v1_extxrgb_merged_upsample_sse2; + mmxfct=jsimd_h2v1_extxrgb_merged_upsample_mmx; + break; + default: + sse2fct=jsimd_h2v1_merged_upsample_sse2; + mmxfct=jsimd_h2v1_merged_upsample_mmx; + break; + } + + if ((simd_support & JSIMD_SSE2) && + IS_ALIGNED_SSE(jconst_merged_upsample_sse2)) + sse2fct(cinfo->output_width, input_buf, in_row_group_ctr, output_buf); + else if (simd_support & JSIMD_MMX) + mmxfct(cinfo->output_width, input_buf, in_row_group_ctr, output_buf); +} + +GLOBAL(int) +jsimd_can_convsamp (void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (DCTSIZE != 8) + return 0; + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + if (sizeof(DCTELEM) != 2) + return 0; + + if (simd_support & JSIMD_SSE2) + return 1; + if (simd_support & JSIMD_MMX) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_convsamp_float (void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (DCTSIZE != 8) + return 0; + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + if (sizeof(FAST_FLOAT) != 4) + return 0; + + if (simd_support & JSIMD_SSE2) + return 1; + if (simd_support & JSIMD_SSE) + return 1; + if (simd_support & JSIMD_3DNOW) + return 1; + + return 0; +} + +GLOBAL(void) +jsimd_convsamp (JSAMPARRAY sample_data, JDIMENSION start_col, + DCTELEM *workspace) +{ + if (simd_support & JSIMD_SSE2) + jsimd_convsamp_sse2(sample_data, start_col, workspace); + else if (simd_support & JSIMD_MMX) + jsimd_convsamp_mmx(sample_data, start_col, workspace); +} + +GLOBAL(void) +jsimd_convsamp_float (JSAMPARRAY sample_data, JDIMENSION start_col, + FAST_FLOAT *workspace) +{ + if (simd_support & JSIMD_SSE2) + jsimd_convsamp_float_sse2(sample_data, start_col, workspace); + else if (simd_support & JSIMD_SSE) + jsimd_convsamp_float_sse(sample_data, start_col, workspace); + else if (simd_support & JSIMD_3DNOW) + jsimd_convsamp_float_3dnow(sample_data, start_col, workspace); +} + +GLOBAL(int) +jsimd_can_fdct_islow (void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (DCTSIZE != 8) + return 0; + if (sizeof(DCTELEM) != 2) + return 0; + + if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_fdct_islow_sse2)) + return 1; + if (simd_support & JSIMD_MMX) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_fdct_ifast (void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (DCTSIZE != 8) + return 0; + if (sizeof(DCTELEM) != 2) + return 0; + + if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_fdct_ifast_sse2)) + return 1; + if (simd_support & JSIMD_MMX) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_fdct_float (void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (DCTSIZE != 8) + return 0; + if (sizeof(FAST_FLOAT) != 4) + return 0; + + if ((simd_support & JSIMD_SSE) && IS_ALIGNED_SSE(jconst_fdct_float_sse)) + return 1; + if (simd_support & JSIMD_3DNOW) + return 1; + + return 0; +} + +GLOBAL(void) +jsimd_fdct_islow (DCTELEM *data) +{ + if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_fdct_islow_sse2)) + jsimd_fdct_islow_sse2(data); + else if (simd_support & JSIMD_MMX) + jsimd_fdct_islow_mmx(data); +} + +GLOBAL(void) +jsimd_fdct_ifast (DCTELEM *data) +{ + if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_fdct_islow_sse2)) + jsimd_fdct_ifast_sse2(data); + else if (simd_support & JSIMD_MMX) + jsimd_fdct_ifast_mmx(data); +} + +GLOBAL(void) +jsimd_fdct_float (FAST_FLOAT *data) +{ + if ((simd_support & JSIMD_SSE) && IS_ALIGNED_SSE(jconst_fdct_float_sse)) + jsimd_fdct_float_sse(data); + else if (simd_support & JSIMD_3DNOW) + jsimd_fdct_float_3dnow(data); +} + +GLOBAL(int) +jsimd_can_quantize (void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (DCTSIZE != 8) + return 0; + if (sizeof(JCOEF) != 2) + return 0; + if (sizeof(DCTELEM) != 2) + return 0; + + if (simd_support & JSIMD_SSE2) + return 1; + if (simd_support & JSIMD_MMX) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_quantize_float (void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (DCTSIZE != 8) + return 0; + if (sizeof(JCOEF) != 2) + return 0; + if (sizeof(FAST_FLOAT) != 4) + return 0; + + if (simd_support & JSIMD_SSE2) + return 1; + if (simd_support & JSIMD_SSE) + return 1; + if (simd_support & JSIMD_3DNOW) + return 1; + + return 0; +} + +GLOBAL(void) +jsimd_quantize (JCOEFPTR coef_block, DCTELEM *divisors, + DCTELEM *workspace) +{ + if (simd_support & JSIMD_SSE2) + jsimd_quantize_sse2(coef_block, divisors, workspace); + else if (simd_support & JSIMD_MMX) + jsimd_quantize_mmx(coef_block, divisors, workspace); +} + +GLOBAL(void) +jsimd_quantize_float (JCOEFPTR coef_block, FAST_FLOAT *divisors, + FAST_FLOAT *workspace) +{ + if (simd_support & JSIMD_SSE2) + jsimd_quantize_float_sse2(coef_block, divisors, workspace); + else if (simd_support & JSIMD_SSE) + jsimd_quantize_float_sse(coef_block, divisors, workspace); + else if (simd_support & JSIMD_3DNOW) + jsimd_quantize_float_3dnow(coef_block, divisors, workspace); +} + +GLOBAL(int) +jsimd_can_idct_2x2 (void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (DCTSIZE != 8) + return 0; + if (sizeof(JCOEF) != 2) + return 0; + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + if (sizeof(ISLOW_MULT_TYPE) != 2) + return 0; + + if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_idct_red_sse2)) + return 1; + if (simd_support & JSIMD_MMX) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_idct_4x4 (void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (DCTSIZE != 8) + return 0; + if (sizeof(JCOEF) != 2) + return 0; + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + if (sizeof(ISLOW_MULT_TYPE) != 2) + return 0; + + if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_idct_red_sse2)) + return 1; + if (simd_support & JSIMD_MMX) + return 1; + + return 0; +} + +GLOBAL(void) +jsimd_idct_2x2 (j_decompress_ptr cinfo, jpeg_component_info *compptr, + JCOEFPTR coef_block, JSAMPARRAY output_buf, + JDIMENSION output_col) +{ + if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_idct_red_sse2)) + jsimd_idct_2x2_sse2(compptr->dct_table, coef_block, output_buf, + output_col); + else if (simd_support & JSIMD_MMX) + jsimd_idct_2x2_mmx(compptr->dct_table, coef_block, output_buf, output_col); +} + +GLOBAL(void) +jsimd_idct_4x4 (j_decompress_ptr cinfo, jpeg_component_info *compptr, + JCOEFPTR coef_block, JSAMPARRAY output_buf, + JDIMENSION output_col) +{ + if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_idct_red_sse2)) + jsimd_idct_4x4_sse2(compptr->dct_table, coef_block, output_buf, + output_col); + else if (simd_support & JSIMD_MMX) + jsimd_idct_4x4_mmx(compptr->dct_table, coef_block, output_buf, output_col); +} + +GLOBAL(int) +jsimd_can_idct_islow (void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (DCTSIZE != 8) + return 0; + if (sizeof(JCOEF) != 2) + return 0; + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + if (sizeof(ISLOW_MULT_TYPE) != 2) + return 0; + + if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_idct_islow_sse2)) + return 1; + if (simd_support & JSIMD_MMX) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_idct_ifast (void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (DCTSIZE != 8) + return 0; + if (sizeof(JCOEF) != 2) + return 0; + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + if (sizeof(IFAST_MULT_TYPE) != 2) + return 0; + if (IFAST_SCALE_BITS != 2) + return 0; + + if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_idct_ifast_sse2)) + return 1; + if (simd_support & JSIMD_MMX) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_idct_float (void) +{ + init_simd(); + + if (DCTSIZE != 8) + return 0; + if (sizeof(JCOEF) != 2) + return 0; + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + if (sizeof(FAST_FLOAT) != 4) + return 0; + if (sizeof(FLOAT_MULT_TYPE) != 4) + return 0; + + if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_idct_float_sse2)) + return 1; + if ((simd_support & JSIMD_SSE) && IS_ALIGNED_SSE(jconst_idct_float_sse)) + return 1; + if (simd_support & JSIMD_3DNOW) + return 1; + + return 0; +} + +GLOBAL(void) +jsimd_idct_islow (j_decompress_ptr cinfo, jpeg_component_info *compptr, + JCOEFPTR coef_block, JSAMPARRAY output_buf, + JDIMENSION output_col) +{ + if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_idct_islow_sse2)) + jsimd_idct_islow_sse2(compptr->dct_table, coef_block, output_buf, + output_col); + else if (simd_support & JSIMD_MMX) + jsimd_idct_islow_mmx(compptr->dct_table, coef_block, output_buf, + output_col); +} + +GLOBAL(void) +jsimd_idct_ifast (j_decompress_ptr cinfo, jpeg_component_info *compptr, + JCOEFPTR coef_block, JSAMPARRAY output_buf, + JDIMENSION output_col) +{ + if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_idct_ifast_sse2)) + jsimd_idct_ifast_sse2(compptr->dct_table, coef_block, output_buf, + output_col); + else if (simd_support & JSIMD_MMX) + jsimd_idct_ifast_mmx(compptr->dct_table, coef_block, output_buf, + output_col); +} + +GLOBAL(void) +jsimd_idct_float (j_decompress_ptr cinfo, jpeg_component_info *compptr, + JCOEFPTR coef_block, JSAMPARRAY output_buf, + JDIMENSION output_col) +{ + if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_idct_float_sse2)) + jsimd_idct_float_sse2(compptr->dct_table, coef_block, output_buf, + output_col); + else if ((simd_support & JSIMD_SSE) && IS_ALIGNED_SSE(jconst_idct_float_sse)) + jsimd_idct_float_sse(compptr->dct_table, coef_block, output_buf, + output_col); + else if (simd_support & JSIMD_3DNOW) + jsimd_idct_float_3dnow(compptr->dct_table, coef_block, output_buf, + output_col); +} + +GLOBAL(int) +jsimd_can_huff_encode_one_block (void) +{ + init_simd(); + + if (DCTSIZE != 8) + return 0; + if (sizeof(JCOEF) != 2) + return 0; + + if ((simd_support & JSIMD_SSE2) && simd_huffman && + IS_ALIGNED_SSE(jconst_huff_encode_one_block)) + return 1; + + return 0; +} + +GLOBAL(JOCTET*) +jsimd_huff_encode_one_block (void *state, JOCTET *buffer, JCOEFPTR block, + int last_dc_val, c_derived_tbl *dctbl, + c_derived_tbl *actbl) +{ + return jsimd_huff_encode_one_block_sse2(state, buffer, block, last_dc_val, + dctbl, actbl); +} diff --git a/media/libjpeg/simd/jsimd_mips.c b/media/libjpeg/simd/jsimd_mips.c new file mode 100644 index 000000000..63b8115d1 --- /dev/null +++ b/media/libjpeg/simd/jsimd_mips.c @@ -0,0 +1,1138 @@ +/* + * jsimd_mips.c + * + * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB + * Copyright (C) 2009-2011, 2014, 2016, D. R. Commander. + * Copyright (C) 2013-2014, MIPS Technologies, Inc., California. + * Copyright (C) 2015, Matthieu Darbois. + * + * Based on the x86 SIMD extension for IJG JPEG library, + * Copyright (C) 1999-2006, MIYASAKA Masaru. + * For conditions of distribution and use, see copyright notice in jsimdext.inc + * + * This file contains the interface between the "normal" portions + * of the library and the SIMD implementations when running on a + * MIPS architecture. + */ + +#define JPEG_INTERNALS +#include "../jinclude.h" +#include "../jpeglib.h" +#include "../jsimd.h" +#include "../jdct.h" +#include "../jsimddct.h" +#include "jsimd.h" + +#include <stdio.h> +#include <string.h> +#include <ctype.h> + +static unsigned int simd_support = ~0; + +#if defined(__linux__) + +LOCAL(int) +parse_proc_cpuinfo(const char* search_string) +{ + const char* file_name = "/proc/cpuinfo"; + char cpuinfo_line[256]; + FILE* f = NULL; + simd_support = 0; + + if ((f = fopen(file_name, "r")) != NULL) { + while (fgets(cpuinfo_line, sizeof(cpuinfo_line), f) != NULL) { + if (strstr(cpuinfo_line, search_string) != NULL) { + fclose(f); + simd_support |= JSIMD_MIPS_DSPR2; + return 1; + } + } + fclose(f); + } + /* Did not find string in the proc file, or not Linux ELF. */ + return 0; +} + +#endif + +/* + * Check what SIMD accelerations are supported. + * + * FIXME: This code is racy under a multi-threaded environment. + */ +LOCAL(void) +init_simd (void) +{ + if (simd_support != ~0U) + return; + + simd_support = 0; + +#if defined(__MIPSEL__) && defined(__mips_dsp) && (__mips_dsp_rev >= 2) + simd_support |= JSIMD_MIPS_DSPR2; +#elif defined(__linux__) + /* We still have a chance to use MIPS DSPR2 regardless of globally used + * -mdspr2 options passed to gcc by performing runtime detection via + * /proc/cpuinfo parsing on linux */ + if (!parse_proc_cpuinfo("MIPS 74K")) + return; +#endif + + /* Force different settings through environment variables */ + env = getenv("JSIMD_FORCEDSPR2"); + if ((env != NULL) && (strcmp(env, "1") == 0)) + simd_support = JSIMD_MIPS_DSPR2; + env = getenv("JSIMD_FORCENONE"); + if ((env != NULL) && (strcmp(env, "1") == 0)) + simd_support = 0; +} + +static const int mips_idct_ifast_coefs[4] = { + 0x45404540, // FIX( 1.082392200 / 2) = 17734 = 0x4546 + 0x5A805A80, // FIX( 1.414213562 / 2) = 23170 = 0x5A82 + 0x76407640, // FIX( 1.847759065 / 2) = 30274 = 0x7642 + 0xAC60AC60 // FIX(-2.613125930 / 4) = -21407 = 0xAC61 +}; + +/* The following struct is borrowed from jdsample.c */ +typedef void (*upsample1_ptr) (j_decompress_ptr cinfo, + jpeg_component_info *compptr, + JSAMPARRAY input_data, + JSAMPARRAY *output_data_ptr); + +typedef struct { + struct jpeg_upsampler pub; + JSAMPARRAY color_buf[MAX_COMPONENTS]; + upsample1_ptr methods[MAX_COMPONENTS]; + int next_row_out; + JDIMENSION rows_to_go; + int rowgroup_height[MAX_COMPONENTS]; + UINT8 h_expand[MAX_COMPONENTS]; + UINT8 v_expand[MAX_COMPONENTS]; +} my_upsampler; + +typedef my_upsampler *my_upsample_ptr; + +GLOBAL(int) +jsimd_can_rgb_ycc (void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4)) + return 0; + + if (simd_support & JSIMD_MIPS_DSPR2) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_rgb_gray (void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4)) + return 0; + + if (simd_support & JSIMD_MIPS_DSPR2) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_ycc_rgb (void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4)) + return 0; + + if (simd_support & JSIMD_MIPS_DSPR2) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_ycc_rgb565 (void) +{ + return 0; +} + +GLOBAL(int) +jsimd_c_can_null_convert (void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + + if (simd_support & JSIMD_MIPS_DSPR2) + return 1; + + return 0; +} + +GLOBAL(void) +jsimd_rgb_ycc_convert (j_compress_ptr cinfo, + JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows) +{ + void (*mipsdspr2fct)(JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int); + + switch(cinfo->in_color_space) { + case JCS_EXT_RGB: + mipsdspr2fct=jsimd_extrgb_ycc_convert_mips_dspr2; + break; + case JCS_EXT_RGBX: + case JCS_EXT_RGBA: + mipsdspr2fct=jsimd_extrgbx_ycc_convert_mips_dspr2; + break; + case JCS_EXT_BGR: + mipsdspr2fct=jsimd_extbgr_ycc_convert_mips_dspr2; + break; + case JCS_EXT_BGRX: + case JCS_EXT_BGRA: + mipsdspr2fct=jsimd_extbgrx_ycc_convert_mips_dspr2; + break; + case JCS_EXT_XBGR: + case JCS_EXT_ABGR: + mipsdspr2fct=jsimd_extxbgr_ycc_convert_mips_dspr2; + + break; + case JCS_EXT_XRGB: + case JCS_EXT_ARGB: + mipsdspr2fct=jsimd_extxrgb_ycc_convert_mips_dspr2; + break; + default: + mipsdspr2fct=jsimd_extrgb_ycc_convert_mips_dspr2; + break; + } + + if (simd_support & JSIMD_MIPS_DSPR2) + mipsdspr2fct(cinfo->image_width, input_buf, output_buf, output_row, + num_rows); +} + +GLOBAL(void) +jsimd_rgb_gray_convert (j_compress_ptr cinfo, + JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows) +{ + void (*mipsdspr2fct)(JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int); + + switch(cinfo->in_color_space) { + case JCS_EXT_RGB: + mipsdspr2fct=jsimd_extrgb_gray_convert_mips_dspr2; + break; + case JCS_EXT_RGBX: + case JCS_EXT_RGBA: + mipsdspr2fct=jsimd_extrgbx_gray_convert_mips_dspr2; + break; + case JCS_EXT_BGR: + mipsdspr2fct=jsimd_extbgr_gray_convert_mips_dspr2; + break; + case JCS_EXT_BGRX: + case JCS_EXT_BGRA: + mipsdspr2fct=jsimd_extbgrx_gray_convert_mips_dspr2; + break; + case JCS_EXT_XBGR: + case JCS_EXT_ABGR: + mipsdspr2fct=jsimd_extxbgr_gray_convert_mips_dspr2; + break; + case JCS_EXT_XRGB: + case JCS_EXT_ARGB: + mipsdspr2fct=jsimd_extxrgb_gray_convert_mips_dspr2; + break; + default: + mipsdspr2fct=jsimd_extrgb_gray_convert_mips_dspr2; + break; + } + + if (simd_support & JSIMD_MIPS_DSPR2) + mipsdspr2fct(cinfo->image_width, input_buf, output_buf, output_row, + num_rows); +} + +GLOBAL(void) +jsimd_ycc_rgb_convert (j_decompress_ptr cinfo, + JSAMPIMAGE input_buf, JDIMENSION input_row, + JSAMPARRAY output_buf, int num_rows) +{ + void (*mipsdspr2fct)(JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY, int); + + switch(cinfo->out_color_space) { + case JCS_EXT_RGB: + mipsdspr2fct=jsimd_ycc_extrgb_convert_mips_dspr2; + break; + case JCS_EXT_RGBX: + case JCS_EXT_RGBA: + mipsdspr2fct=jsimd_ycc_extrgbx_convert_mips_dspr2; + break; + case JCS_EXT_BGR: + mipsdspr2fct=jsimd_ycc_extbgr_convert_mips_dspr2; + break; + case JCS_EXT_BGRX: + case JCS_EXT_BGRA: + mipsdspr2fct=jsimd_ycc_extbgrx_convert_mips_dspr2; + break; + case JCS_EXT_XBGR: + case JCS_EXT_ABGR: + mipsdspr2fct=jsimd_ycc_extxbgr_convert_mips_dspr2; + break; + case JCS_EXT_XRGB: + case JCS_EXT_ARGB: + mipsdspr2fct=jsimd_ycc_extxrgb_convert_mips_dspr2; + break; + default: + mipsdspr2fct=jsimd_ycc_extrgb_convert_mips_dspr2; + break; + } + + if (simd_support & JSIMD_MIPS_DSPR2) + mipsdspr2fct(cinfo->output_width, input_buf, input_row, output_buf, + num_rows); +} + +GLOBAL(void) +jsimd_ycc_rgb565_convert (j_decompress_ptr cinfo, + JSAMPIMAGE input_buf, JDIMENSION input_row, + JSAMPARRAY output_buf, int num_rows) +{ +} + +GLOBAL(void) +jsimd_c_null_convert (j_compress_ptr cinfo, + JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows) +{ + if (simd_support & JSIMD_MIPS_DSPR2) + jsimd_c_null_convert_mips_dspr2(cinfo->image_width, input_buf, + output_buf, output_row, num_rows, + cinfo->num_components); +} + +GLOBAL(int) +jsimd_can_h2v2_downsample (void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + + if (simd_support & JSIMD_MIPS_DSPR2) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_h2v2_smooth_downsample (void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + if(DCTSIZE != 8) + return 0; + + if (simd_support & JSIMD_MIPS_DSPR2) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_h2v1_downsample (void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + + if (simd_support & JSIMD_MIPS_DSPR2) + return 1; + + return 0; +} + +GLOBAL(void) +jsimd_h2v2_downsample (j_compress_ptr cinfo, jpeg_component_info *compptr, + JSAMPARRAY input_data, JSAMPARRAY output_data) +{ + if (simd_support & JSIMD_MIPS_DSPR2) + jsimd_h2v2_downsample_mips_dspr2(cinfo->image_width, + cinfo->max_v_samp_factor, + compptr->v_samp_factor, + compptr->width_in_blocks, input_data, + output_data); +} + +GLOBAL(void) +jsimd_h2v2_smooth_downsample (j_compress_ptr cinfo, + jpeg_component_info *compptr, + JSAMPARRAY input_data, JSAMPARRAY output_data) +{ + jsimd_h2v2_smooth_downsample_mips_dspr2(input_data, output_data, + compptr->v_samp_factor, + cinfo->max_v_samp_factor, + cinfo->smoothing_factor, + compptr->width_in_blocks, + cinfo->image_width); +} + +GLOBAL(void) +jsimd_h2v1_downsample (j_compress_ptr cinfo, jpeg_component_info *compptr, + JSAMPARRAY input_data, JSAMPARRAY output_data) +{ + if (simd_support & JSIMD_MIPS_DSPR2) + jsimd_h2v1_downsample_mips_dspr2(cinfo->image_width, + cinfo->max_v_samp_factor, + compptr->v_samp_factor, + compptr->width_in_blocks, + input_data, output_data); +} + +GLOBAL(int) +jsimd_can_h2v2_upsample (void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + + if (simd_support & JSIMD_MIPS_DSPR2) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_h2v1_upsample (void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + + if (simd_support & JSIMD_MIPS_DSPR2) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_int_upsample (void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + + if (simd_support & JSIMD_MIPS_DSPR2) + return 1; + + return 0; +} + +GLOBAL(void) +jsimd_h2v2_upsample (j_decompress_ptr cinfo, + jpeg_component_info *compptr, + JSAMPARRAY input_data, + JSAMPARRAY *output_data_ptr) +{ + if (simd_support & JSIMD_MIPS_DSPR2) + jsimd_h2v2_upsample_mips_dspr2(cinfo->max_v_samp_factor, + cinfo->output_width, input_data, + output_data_ptr); +} + +GLOBAL(void) +jsimd_h2v1_upsample (j_decompress_ptr cinfo, + jpeg_component_info *compptr, + JSAMPARRAY input_data, + JSAMPARRAY *output_data_ptr) +{ + if (simd_support & JSIMD_MIPS_DSPR2) + jsimd_h2v1_upsample_mips_dspr2(cinfo->max_v_samp_factor, + cinfo->output_width, input_data, + output_data_ptr); +} + +GLOBAL(void) +jsimd_int_upsample (j_decompress_ptr cinfo, jpeg_component_info *compptr, + JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr) +{ + my_upsample_ptr upsample = (my_upsample_ptr) cinfo->upsample; + + jsimd_int_upsample_mips_dspr2(upsample->h_expand[compptr->component_index], + upsample->v_expand[compptr->component_index], + input_data, output_data_ptr, + cinfo->output_width, + cinfo->max_v_samp_factor); +} + +GLOBAL(int) +jsimd_can_h2v2_fancy_upsample (void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + + if (simd_support & JSIMD_MIPS_DSPR2) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_h2v1_fancy_upsample (void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + + if (simd_support & JSIMD_MIPS_DSPR2) + return 1; + + return 0; +} + +GLOBAL(void) +jsimd_h2v2_fancy_upsample (j_decompress_ptr cinfo, + jpeg_component_info *compptr, + JSAMPARRAY input_data, + JSAMPARRAY *output_data_ptr) +{ + if (simd_support & JSIMD_MIPS_DSPR2) + jsimd_h2v2_fancy_upsample_mips_dspr2(cinfo->max_v_samp_factor, + compptr->downsampled_width, + input_data, output_data_ptr); +} + +GLOBAL(void) +jsimd_h2v1_fancy_upsample (j_decompress_ptr cinfo, + jpeg_component_info *compptr, + JSAMPARRAY input_data, + JSAMPARRAY *output_data_ptr) +{ + if (simd_support & JSIMD_MIPS_DSPR2) + jsimd_h2v1_fancy_upsample_mips_dspr2(cinfo->max_v_samp_factor, + compptr->downsampled_width, + input_data, output_data_ptr); +} + +GLOBAL(int) +jsimd_can_h2v2_merged_upsample (void) +{ + init_simd(); + + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + + if (simd_support & JSIMD_MIPS_DSPR2) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_h2v1_merged_upsample (void) +{ + init_simd(); + + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + + if (simd_support & JSIMD_MIPS_DSPR2) + return 1; + + return 0; +} + +GLOBAL(void) +jsimd_h2v2_merged_upsample (j_decompress_ptr cinfo, + JSAMPIMAGE input_buf, + JDIMENSION in_row_group_ctr, + JSAMPARRAY output_buf) +{ + void (*mipsdspr2fct)(JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY, + JSAMPLE *); + + switch(cinfo->out_color_space) { + case JCS_EXT_RGB: + mipsdspr2fct=jsimd_h2v2_extrgb_merged_upsample_mips_dspr2; + break; + case JCS_EXT_RGBX: + case JCS_EXT_RGBA: + mipsdspr2fct=jsimd_h2v2_extrgbx_merged_upsample_mips_dspr2; + break; + case JCS_EXT_BGR: + mipsdspr2fct=jsimd_h2v2_extbgr_merged_upsample_mips_dspr2; + break; + case JCS_EXT_BGRX: + case JCS_EXT_BGRA: + mipsdspr2fct=jsimd_h2v2_extbgrx_merged_upsample_mips_dspr2; + break; + case JCS_EXT_XBGR: + case JCS_EXT_ABGR: + mipsdspr2fct=jsimd_h2v2_extxbgr_merged_upsample_mips_dspr2; + break; + case JCS_EXT_XRGB: + case JCS_EXT_ARGB: + mipsdspr2fct=jsimd_h2v2_extxrgb_merged_upsample_mips_dspr2; + break; + default: + mipsdspr2fct=jsimd_h2v2_extrgb_merged_upsample_mips_dspr2; + break; + } + + mipsdspr2fct(cinfo->output_width, input_buf, in_row_group_ctr, output_buf, + cinfo->sample_range_limit); +} + +GLOBAL(void) +jsimd_h2v1_merged_upsample (j_decompress_ptr cinfo, + JSAMPIMAGE input_buf, + JDIMENSION in_row_group_ctr, + JSAMPARRAY output_buf) +{ + void (*mipsdspr2fct)(JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY, + JSAMPLE *); + + switch(cinfo->out_color_space) { + case JCS_EXT_RGB: + mipsdspr2fct=jsimd_h2v1_extrgb_merged_upsample_mips_dspr2; + break; + case JCS_EXT_RGBX: + case JCS_EXT_RGBA: + mipsdspr2fct=jsimd_h2v1_extrgbx_merged_upsample_mips_dspr2; + break; + case JCS_EXT_BGR: + mipsdspr2fct=jsimd_h2v1_extbgr_merged_upsample_mips_dspr2; + break; + case JCS_EXT_BGRX: + case JCS_EXT_BGRA: + mipsdspr2fct=jsimd_h2v1_extbgrx_merged_upsample_mips_dspr2; + break; + case JCS_EXT_XBGR: + case JCS_EXT_ABGR: + mipsdspr2fct=jsimd_h2v1_extxbgr_merged_upsample_mips_dspr2; + break; + case JCS_EXT_XRGB: + case JCS_EXT_ARGB: + mipsdspr2fct=jsimd_h2v1_extxrgb_merged_upsample_mips_dspr2; + break; + default: + mipsdspr2fct=jsimd_h2v1_extrgb_merged_upsample_mips_dspr2; + break; + } + + mipsdspr2fct(cinfo->output_width, input_buf, in_row_group_ctr, output_buf, + cinfo->sample_range_limit); +} + +GLOBAL(int) +jsimd_can_convsamp (void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (DCTSIZE != 8) + return 0; + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + if (sizeof(DCTELEM) != 2) + return 0; + + if (simd_support & JSIMD_MIPS_DSPR2) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_convsamp_float (void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (DCTSIZE != 8) + return 0; + if (sizeof(JCOEF) != 2) + return 0; + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + if (sizeof(ISLOW_MULT_TYPE) != 2) + return 0; + + if (simd_support & JSIMD_MIPS_DSPR2) + return 1; + + return 0; +} + +GLOBAL(void) +jsimd_convsamp (JSAMPARRAY sample_data, JDIMENSION start_col, + DCTELEM *workspace) +{ + if (simd_support & JSIMD_MIPS_DSPR2) + jsimd_convsamp_mips_dspr2(sample_data, start_col, workspace); +} + +GLOBAL(void) +jsimd_convsamp_float (JSAMPARRAY sample_data, JDIMENSION start_col, + FAST_FLOAT *workspace) +{ + if ((simd_support & JSIMD_MIPS_DSPR2)) + jsimd_convsamp_float_mips_dspr2(sample_data, start_col, workspace); +} + +GLOBAL(int) +jsimd_can_fdct_islow (void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (DCTSIZE != 8) + return 0; + if (sizeof(DCTELEM) != 2) + return 0; + + if (simd_support & JSIMD_MIPS_DSPR2) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_fdct_ifast (void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (DCTSIZE != 8) + return 0; + if (sizeof(DCTELEM) != 2) + return 0; + + if (simd_support & JSIMD_MIPS_DSPR2) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_fdct_float (void) +{ + init_simd(); + + return 0; +} + +GLOBAL(void) +jsimd_fdct_islow (DCTELEM *data) +{ + if (simd_support & JSIMD_MIPS_DSPR2) + jsimd_fdct_islow_mips_dspr2(data); +} + +GLOBAL(void) +jsimd_fdct_ifast (DCTELEM *data) +{ + if (simd_support & JSIMD_MIPS_DSPR2) + jsimd_fdct_ifast_mips_dspr2(data); +} + +GLOBAL(void) +jsimd_fdct_float (FAST_FLOAT *data) +{ +} + +GLOBAL(int) +jsimd_can_quantize (void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (DCTSIZE != 8) + return 0; + if (sizeof(JCOEF) != 2) + return 0; + if (sizeof(DCTELEM) != 2) + return 0; + + if (simd_support & JSIMD_MIPS_DSPR2) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_quantize_float (void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (DCTSIZE != 8) + return 0; + if (sizeof(JCOEF) != 2) + return 0; + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + if (sizeof(ISLOW_MULT_TYPE) != 2) + return 0; + + if (simd_support & JSIMD_MIPS_DSPR2) + return 1; + + return 0; +} + +GLOBAL(void) +jsimd_quantize (JCOEFPTR coef_block, DCTELEM *divisors, + DCTELEM *workspace) +{ + if (simd_support & JSIMD_MIPS_DSPR2) + jsimd_quantize_mips_dspr2(coef_block, divisors, workspace); +} + +GLOBAL(void) +jsimd_quantize_float (JCOEFPTR coef_block, FAST_FLOAT *divisors, + FAST_FLOAT *workspace) +{ + if (simd_support & JSIMD_MIPS_DSPR2) + jsimd_quantize_float_mips_dspr2(coef_block, divisors, workspace); +} + +GLOBAL(int) +jsimd_can_idct_2x2 (void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (DCTSIZE != 8) + return 0; + if (sizeof(JCOEF) != 2) + return 0; + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + if (sizeof(ISLOW_MULT_TYPE) != 2) + return 0; + + if (simd_support & JSIMD_MIPS_DSPR2) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_idct_4x4 (void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (DCTSIZE != 8) + return 0; + if (sizeof(JCOEF) != 2) + return 0; + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + if (sizeof(ISLOW_MULT_TYPE) != 2) + return 0; + + if (simd_support & JSIMD_MIPS_DSPR2) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_idct_6x6 (void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (DCTSIZE != 8) + return 0; + if (sizeof(JCOEF) != 2) + return 0; + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + if (sizeof(ISLOW_MULT_TYPE) != 2) + return 0; + + if (simd_support & JSIMD_MIPS_DSPR2) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_idct_12x12 (void) +{ + init_simd(); + + if (BITS_IN_JSAMPLE != 8) + return 0; + if (DCTSIZE != 8) + return 0; + if (sizeof(JCOEF) != 2) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + if (sizeof(ISLOW_MULT_TYPE) != 2) + return 0; + + if (simd_support & JSIMD_MIPS_DSPR2) + return 1; + + return 0; +} + +GLOBAL(void) +jsimd_idct_2x2 (j_decompress_ptr cinfo, jpeg_component_info *compptr, + JCOEFPTR coef_block, JSAMPARRAY output_buf, + JDIMENSION output_col) +{ + if (simd_support & JSIMD_MIPS_DSPR2) + jsimd_idct_2x2_mips_dspr2(compptr->dct_table, coef_block, output_buf, + output_col); +} + +GLOBAL(void) +jsimd_idct_4x4 (j_decompress_ptr cinfo, jpeg_component_info *compptr, + JCOEFPTR coef_block, JSAMPARRAY output_buf, + JDIMENSION output_col) +{ + if (simd_support & JSIMD_MIPS_DSPR2) { + int workspace[DCTSIZE*4]; /* buffers data between passes */ + jsimd_idct_4x4_mips_dspr2(compptr->dct_table, coef_block, output_buf, + output_col, workspace); + } +} + +GLOBAL(void) +jsimd_idct_6x6 (j_decompress_ptr cinfo, jpeg_component_info *compptr, + JCOEFPTR coef_block, JSAMPARRAY output_buf, + JDIMENSION output_col) +{ + if (simd_support & JSIMD_MIPS_DSPR2) + jsimd_idct_6x6_mips_dspr2(compptr->dct_table, coef_block, output_buf, + output_col); +} + +GLOBAL(void) +jsimd_idct_12x12 (j_decompress_ptr cinfo, jpeg_component_info *compptr, + JCOEFPTR coef_block, + JSAMPARRAY output_buf, JDIMENSION output_col) +{ + if (simd_support & JSIMD_MIPS_DSPR2) { + int workspace[96]; + int output[12] = { + (int)(output_buf[0] + output_col), + (int)(output_buf[1] + output_col), + (int)(output_buf[2] + output_col), + (int)(output_buf[3] + output_col), + (int)(output_buf[4] + output_col), + (int)(output_buf[5] + output_col), + (int)(output_buf[6] + output_col), + (int)(output_buf[7] + output_col), + (int)(output_buf[8] + output_col), + (int)(output_buf[9] + output_col), + (int)(output_buf[10] + output_col), + (int)(output_buf[11] + output_col), + }; + jsimd_idct_12x12_pass1_mips_dspr2(coef_block, compptr->dct_table, + workspace); + jsimd_idct_12x12_pass2_mips_dspr2(workspace, output); + } +} + +GLOBAL(int) +jsimd_can_idct_islow (void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (DCTSIZE != 8) + return 0; + if (sizeof(JCOEF) != 2) + return 0; + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + if (sizeof(ISLOW_MULT_TYPE) != 2) + return 0; + + if (simd_support & JSIMD_MIPS_DSPR2) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_idct_ifast (void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (DCTSIZE != 8) + return 0; + if (sizeof(JCOEF) != 2) + return 0; + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + if (sizeof(IFAST_MULT_TYPE) != 2) + return 0; + if (IFAST_SCALE_BITS != 2) + return 0; + + if (simd_support & JSIMD_MIPS_DSPR2) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_idct_float (void) +{ + init_simd(); + + return 0; +} + +GLOBAL(void) +jsimd_idct_islow (j_decompress_ptr cinfo, jpeg_component_info *compptr, + JCOEFPTR coef_block, JSAMPARRAY output_buf, + JDIMENSION output_col) +{ + if (simd_support & JSIMD_MIPS_DSPR2) { + int output[8] = { + (int)(output_buf[0] + output_col), + (int)(output_buf[1] + output_col), + (int)(output_buf[2] + output_col), + (int)(output_buf[3] + output_col), + (int)(output_buf[4] + output_col), + (int)(output_buf[5] + output_col), + (int)(output_buf[6] + output_col), + (int)(output_buf[7] + output_col), + }; + + jsimd_idct_islow_mips_dspr2(coef_block, compptr->dct_table, + output, IDCT_range_limit(cinfo)); + } +} + +GLOBAL(void) +jsimd_idct_ifast (j_decompress_ptr cinfo, jpeg_component_info *compptr, + JCOEFPTR coef_block, JSAMPARRAY output_buf, + JDIMENSION output_col) +{ + if (simd_support & JSIMD_MIPS_DSPR2) { + JCOEFPTR inptr; + IFAST_MULT_TYPE *quantptr; + DCTELEM workspace[DCTSIZE2]; /* buffers data between passes */ + + /* Pass 1: process columns from input, store into work array. */ + + inptr = coef_block; + quantptr = (IFAST_MULT_TYPE *) compptr->dct_table; + + jsimd_idct_ifast_cols_mips_dspr2(inptr, quantptr, + workspace, mips_idct_ifast_coefs); + + /* Pass 2: process rows from work array, store into output array. */ + /* Note that we must descale the results by a factor of 8 == 2**3, */ + /* and also undo the PASS1_BITS scaling. */ + + jsimd_idct_ifast_rows_mips_dspr2(workspace, output_buf, + output_col, mips_idct_ifast_coefs); + } +} + +GLOBAL(void) +jsimd_idct_float (j_decompress_ptr cinfo, jpeg_component_info *compptr, + JCOEFPTR coef_block, JSAMPARRAY output_buf, + JDIMENSION output_col) +{ +} + +GLOBAL(int) +jsimd_can_huff_encode_one_block (void) +{ + return 0; +} + +GLOBAL(JOCTET*) +jsimd_huff_encode_one_block (void *state, JOCTET *buffer, JCOEFPTR block, + int last_dc_val, c_derived_tbl *dctbl, + c_derived_tbl *actbl) +{ + return NULL; +} diff --git a/media/libjpeg/simd/jsimd_mips_dspr2.S b/media/libjpeg/simd/jsimd_mips_dspr2.S new file mode 100644 index 000000000..c8c286cb3 --- /dev/null +++ b/media/libjpeg/simd/jsimd_mips_dspr2.S @@ -0,0 +1,4487 @@ +/* + * MIPS DSPr2 optimizations for libjpeg-turbo + * + * Copyright (C) 2013-2014, MIPS Technologies, Inc., California. + * All Rights Reserved. + * Authors: Teodora Novkovic (teodora.novkovic@imgtec.com) + * Darko Laus (darko.laus@imgtec.com) + * Copyright (C) 2015, D. R. Commander. All Rights Reserved. + * This software is provided 'as-is', without any express or implied + * warranty. In no event will the authors be held liable for any damages + * arising from the use of this software. + * + * Permission is granted to anyone to use this software for any purpose, + * including commercial applications, and to alter it and redistribute it + * freely, subject to the following restrictions: + * + * 1. The origin of this software must not be misrepresented; you must not + * claim that you wrote the original software. If you use this software + * in a product, an acknowledgment in the product documentation would be + * appreciated but is not required. + * 2. Altered source versions must be plainly marked as such, and must not be + * misrepresented as being the original software. + * 3. This notice may not be removed or altered from any source distribution. + */ + +#include "jsimd_mips_dspr2_asm.h" + +/*****************************************************************************/ +LEAF_MIPS_DSPR2(jsimd_c_null_convert_mips_dspr2) +/* + * a0 - cinfo->image_width + * a1 - input_buf + * a2 - output_buf + * a3 - output_row + * 16(sp) - num_rows + * 20(sp) - cinfo->num_components + * + * Null conversion for compression + */ + + SAVE_REGS_ON_STACK 8, s0, s1 + + lw t9, 24(sp) // t9 = num_rows + lw s0, 28(sp) // s0 = cinfo->num_components + andi t0, a0, 3 // t0 = cinfo->image_width & 3 + beqz t0, 4f // no residual + nop +0: + addiu t9, t9, -1 + bltz t9, 7f + li t1, 0 +1: + sll t3, t1, 2 + lwx t5, t3(a2) // t5 = outptr = output_buf[ci] + lw t2, 0(a1) // t2 = inptr = *input_buf + sll t4, a3, 2 + lwx t5, t4(t5) // t5 = outptr = output_buf[ci][output_row] + addu t2, t2, t1 + addu s1, t5, a0 + addu t6, t5, t0 +2: + lbu t3, 0(t2) + addiu t5, t5, 1 + sb t3, -1(t5) + bne t6, t5, 2b + addu t2, t2, s0 +3: + lbu t3, 0(t2) + addu t4, t2, s0 + addu t7, t4, s0 + addu t8, t7, s0 + addu t2, t8, s0 + lbu t4, 0(t4) + lbu t7, 0(t7) + lbu t8, 0(t8) + addiu t5, t5, 4 + sb t3, -4(t5) + sb t4, -3(t5) + sb t7, -2(t5) + bne s1, t5, 3b + sb t8, -1(t5) + addiu t1, t1, 1 + bne t1, s0, 1b + nop + addiu a1, a1, 4 + bgez t9, 0b + addiu a3, a3, 1 + b 7f + nop +4: + addiu t9, t9, -1 + bltz t9, 7f + li t1, 0 +5: + sll t3, t1, 2 + lwx t5, t3(a2) // t5 = outptr = output_buf[ci] + lw t2, 0(a1) // t2 = inptr = *input_buf + sll t4, a3, 2 + lwx t5, t4(t5) // t5 = outptr = output_buf[ci][output_row] + addu t2, t2, t1 + addu s1, t5, a0 + addu t6, t5, t0 +6: + lbu t3, 0(t2) + addu t4, t2, s0 + addu t7, t4, s0 + addu t8, t7, s0 + addu t2, t8, s0 + lbu t4, 0(t4) + lbu t7, 0(t7) + lbu t8, 0(t8) + addiu t5, t5, 4 + sb t3, -4(t5) + sb t4, -3(t5) + sb t7, -2(t5) + bne s1, t5, 6b + sb t8, -1(t5) + addiu t1, t1, 1 + bne t1, s0, 5b + nop + addiu a1, a1, 4 + bgez t9, 4b + addiu a3, a3, 1 +7: + RESTORE_REGS_FROM_STACK 8, s0, s1 + + j ra + nop + +END(jsimd_c_null_convert_mips_dspr2) + +/*****************************************************************************/ +/* + * jsimd_extrgb_ycc_convert_mips_dspr2 + * jsimd_extbgr_ycc_convert_mips_dspr2 + * jsimd_extrgbx_ycc_convert_mips_dspr2 + * jsimd_extbgrx_ycc_convert_mips_dspr2 + * jsimd_extxbgr_ycc_convert_mips_dspr2 + * jsimd_extxrgb_ycc_convert_mips_dspr2 + * + * Colorspace conversion RGB -> YCbCr + */ + +.macro GENERATE_JSIMD_RGB_YCC_CONVERT_MIPS_DSPR2 colorid, pixel_size, r_offs, g_offs, b_offs + +.macro DO_RGB_TO_YCC r, \ + g, \ + b, \ + inptr + lbu \r, \r_offs(\inptr) + lbu \g, \g_offs(\inptr) + lbu \b, \b_offs(\inptr) + addiu \inptr, \pixel_size +.endm + +LEAF_MIPS_DSPR2(jsimd_\colorid\()_ycc_convert_mips_dspr2) +/* + * a0 - cinfo->image_width + * a1 - input_buf + * a2 - output_buf + * a3 - output_row + * 16(sp) - num_rows + */ + + SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7 + + lw t7, 48(sp) // t7 = num_rows + li s0, 0x4c8b // FIX(0.29900) + li s1, 0x9646 // FIX(0.58700) + li s2, 0x1d2f // FIX(0.11400) + li s3, 0xffffd4cd // -FIX(0.16874) + li s4, 0xffffab33 // -FIX(0.33126) + li s5, 0x8000 // FIX(0.50000) + li s6, 0xffff94d1 // -FIX(0.41869) + li s7, 0xffffeb2f // -FIX(0.08131) + li t8, 0x807fff // CBCR_OFFSET + ONE_HALF-1 + +0: + addiu t7, -1 // --num_rows + lw t6, 0(a1) // t6 = input_buf[0] + lw t0, 0(a2) + lw t1, 4(a2) + lw t2, 8(a2) + sll t3, a3, 2 + lwx t0, t3(t0) // t0 = output_buf[0][output_row] + lwx t1, t3(t1) // t1 = output_buf[1][output_row] + lwx t2, t3(t2) // t2 = output_buf[2][output_row] + + addu t9, t2, a0 // t9 = end address + addiu a3, 1 + +1: + DO_RGB_TO_YCC t3, t4, t5, t6 + + mtlo s5, $ac0 + mtlo t8, $ac1 + mtlo t8, $ac2 + maddu $ac0, s2, t5 + maddu $ac1, s5, t5 + maddu $ac2, s5, t3 + maddu $ac0, s0, t3 + maddu $ac1, s3, t3 + maddu $ac2, s6, t4 + maddu $ac0, s1, t4 + maddu $ac1, s4, t4 + maddu $ac2, s7, t5 + extr.w t3, $ac0, 16 + extr.w t4, $ac1, 16 + extr.w t5, $ac2, 16 + sb t3, 0(t0) + sb t4, 0(t1) + sb t5, 0(t2) + addiu t0, 1 + addiu t2, 1 + bne t2, t9, 1b + addiu t1, 1 + bgtz t7, 0b + addiu a1, 4 + + RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7 + + j ra + nop +END(jsimd_\colorid\()_ycc_convert_mips_dspr2) + +.purgem DO_RGB_TO_YCC + +.endm + +/*------------------------------------------id -- pix R G B */ +GENERATE_JSIMD_RGB_YCC_CONVERT_MIPS_DSPR2 extrgb, 3, 0, 1, 2 +GENERATE_JSIMD_RGB_YCC_CONVERT_MIPS_DSPR2 extbgr, 3, 2, 1, 0 +GENERATE_JSIMD_RGB_YCC_CONVERT_MIPS_DSPR2 extrgbx, 4, 0, 1, 2 +GENERATE_JSIMD_RGB_YCC_CONVERT_MIPS_DSPR2 extbgrx, 4, 2, 1, 0 +GENERATE_JSIMD_RGB_YCC_CONVERT_MIPS_DSPR2 extxbgr, 4, 3, 2, 1 +GENERATE_JSIMD_RGB_YCC_CONVERT_MIPS_DSPR2 extxrgb, 4, 1, 2, 3 + +/*****************************************************************************/ +/* + * jsimd_ycc_extrgb_convert_mips_dspr2 + * jsimd_ycc_extbgr_convert_mips_dspr2 + * jsimd_ycc_extrgbx_convert_mips_dspr2 + * jsimd_ycc_extbgrx_convert_mips_dspr2 + * jsimd_ycc_extxbgr_convert_mips_dspr2 + * jsimd_ycc_extxrgb_convert_mips_dspr2 + * + * Colorspace conversion YCbCr -> RGB + */ + +.macro GENERATE_JSIMD_YCC_RGB_CONVERT_MIPS_DSPR2 colorid, pixel_size, r_offs, g_offs, b_offs, a_offs + +.macro STORE_YCC_TO_RGB scratch0 \ + scratch1 \ + scratch2 \ + outptr + sb \scratch0, \r_offs(\outptr) + sb \scratch1, \g_offs(\outptr) + sb \scratch2, \b_offs(\outptr) +.if (\pixel_size == 4) + li t0, 0xFF + sb t0, \a_offs(\outptr) +.endif + addiu \outptr, \pixel_size +.endm + +LEAF_MIPS_DSPR2(jsimd_ycc_\colorid\()_convert_mips_dspr2) +/* + * a0 - cinfo->image_width + * a1 - input_buf + * a2 - input_row + * a3 - output_buf + * 16(sp) - num_rows + */ + + SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7 + + lw s1, 48(sp) + li t3, 0x8000 + li t4, 0x166e9 // FIX(1.40200) + li t5, 0x1c5a2 // FIX(1.77200) + li t6, 0xffff492e // -FIX(0.71414) + li t7, 0xffffa7e6 // -FIX(0.34414) + repl.ph t8, 128 + +0: + lw s0, 0(a3) + lw t0, 0(a1) + lw t1, 4(a1) + lw t2, 8(a1) + sll s5, a2, 2 + addiu s1, -1 + lwx s2, s5(t0) + lwx s3, s5(t1) + lwx s4, s5(t2) + addu t9, s2, a0 + addiu a2, 1 + +1: + lbu s7, 0(s4) // cr + lbu s6, 0(s3) // cb + lbu s5, 0(s2) // y + addiu s2, 1 + addiu s4, 1 + addiu s7, -128 + addiu s6, -128 + mul t2, t7, s6 + mul t0, t6, s7 // Crgtab[cr] + sll s7, 15 + mulq_rs.w t1, t4, s7 // Crrtab[cr] + sll s6, 15 + addu t2, t3 // Cbgtab[cb] + addu t2, t0 + + mulq_rs.w t0, t5, s6 // Cbbtab[cb] + sra t2, 16 + addu t1, s5 + addu t2, s5 // add y + ins t2, t1, 16, 16 + subu.ph t2, t2, t8 + addu t0, s5 + shll_s.ph t2, t2, 8 + subu t0, 128 + shra.ph t2, t2, 8 + shll_s.w t0, t0, 24 + addu.ph t2, t2, t8 // clip & store + sra t0, t0, 24 + sra t1, t2, 16 + addiu t0, 128 + + STORE_YCC_TO_RGB t1, t2, t0, s0 + + bne s2, t9, 1b + addiu s3, 1 + bgtz s1, 0b + addiu a3, 4 + + RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7 + + j ra + nop +END(jsimd_ycc_\colorid\()_convert_mips_dspr2) + +.purgem STORE_YCC_TO_RGB + +.endm + +/*------------------------------------------id -- pix R G B A */ +GENERATE_JSIMD_YCC_RGB_CONVERT_MIPS_DSPR2 extrgb, 3, 0, 1, 2, 3 +GENERATE_JSIMD_YCC_RGB_CONVERT_MIPS_DSPR2 extbgr, 3, 2, 1, 0, 3 +GENERATE_JSIMD_YCC_RGB_CONVERT_MIPS_DSPR2 extrgbx, 4, 0, 1, 2, 3 +GENERATE_JSIMD_YCC_RGB_CONVERT_MIPS_DSPR2 extbgrx, 4, 2, 1, 0, 3 +GENERATE_JSIMD_YCC_RGB_CONVERT_MIPS_DSPR2 extxbgr, 4, 3, 2, 1, 0 +GENERATE_JSIMD_YCC_RGB_CONVERT_MIPS_DSPR2 extxrgb, 4, 1, 2, 3, 0 + +/*****************************************************************************/ +/* + * jsimd_extrgb_gray_convert_mips_dspr2 + * jsimd_extbgr_gray_convert_mips_dspr2 + * jsimd_extrgbx_gray_convert_mips_dspr2 + * jsimd_extbgrx_gray_convert_mips_dspr2 + * jsimd_extxbgr_gray_convert_mips_dspr2 + * jsimd_extxrgb_gray_convert_mips_dspr2 + * + * Colorspace conversion RGB -> GRAY + */ + +.macro GENERATE_JSIMD_RGB_GRAY_CONVERT_MIPS_DSPR2 colorid, pixel_size, r_offs, g_offs, b_offs + +.macro DO_RGB_TO_GRAY r, \ + g, \ + b, \ + inptr + lbu \r, \r_offs(\inptr) + lbu \g, \g_offs(\inptr) + lbu \b, \b_offs(\inptr) + addiu \inptr, \pixel_size +.endm + +LEAF_MIPS_DSPR2(jsimd_\colorid\()_gray_convert_mips_dspr2) +/* + * a0 - cinfo->image_width + * a1 - input_buf + * a2 - output_buf + * a3 - output_row + * 16(sp) - num_rows + */ + + SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7 + + li s0, 0x4c8b // s0 = FIX(0.29900) + li s1, 0x9646 // s1 = FIX(0.58700) + li s2, 0x1d2f // s2 = FIX(0.11400) + li s7, 0x8000 // s7 = FIX(0.50000) + lw s6, 48(sp) + andi t7, a0, 3 + +0: + addiu s6, -1 // s6 = num_rows + lw t0, 0(a1) + lw t1, 0(a2) + sll t3, a3, 2 + lwx t1, t3(t1) + addiu a3, 1 + addu t9, t1, a0 + subu t8, t9, t7 + beq t1, t8, 2f + nop + +1: + DO_RGB_TO_GRAY t3, t4, t5, t0 + DO_RGB_TO_GRAY s3, s4, s5, t0 + + mtlo s7, $ac0 + maddu $ac0, s2, t5 + maddu $ac0, s1, t4 + maddu $ac0, s0, t3 + mtlo s7, $ac1 + maddu $ac1, s2, s5 + maddu $ac1, s1, s4 + maddu $ac1, s0, s3 + extr.w t6, $ac0, 16 + + DO_RGB_TO_GRAY t3, t4, t5, t0 + DO_RGB_TO_GRAY s3, s4, s5, t0 + + mtlo s7, $ac0 + maddu $ac0, s2, t5 + maddu $ac0, s1, t4 + extr.w t2, $ac1, 16 + maddu $ac0, s0, t3 + mtlo s7, $ac1 + maddu $ac1, s2, s5 + maddu $ac1, s1, s4 + maddu $ac1, s0, s3 + extr.w t5, $ac0, 16 + sb t6, 0(t1) + sb t2, 1(t1) + extr.w t3, $ac1, 16 + addiu t1, 4 + sb t5, -2(t1) + sb t3, -1(t1) + bne t1, t8, 1b + nop + +2: + beqz t7, 4f + nop + +3: + DO_RGB_TO_GRAY t3, t4, t5, t0 + + mtlo s7, $ac0 + maddu $ac0, s2, t5 + maddu $ac0, s1, t4 + maddu $ac0, s0, t3 + extr.w t6, $ac0, 16 + sb t6, 0(t1) + addiu t1, 1 + bne t1, t9, 3b + nop + +4: + bgtz s6, 0b + addiu a1, 4 + + RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7 + + j ra + nop +END(jsimd_\colorid\()_gray_convert_mips_dspr2) + +.purgem DO_RGB_TO_GRAY + +.endm + +/*------------------------------------------id -- pix R G B */ +GENERATE_JSIMD_RGB_GRAY_CONVERT_MIPS_DSPR2 extrgb, 3, 0, 1, 2 +GENERATE_JSIMD_RGB_GRAY_CONVERT_MIPS_DSPR2 extbgr, 3, 2, 1, 0 +GENERATE_JSIMD_RGB_GRAY_CONVERT_MIPS_DSPR2 extrgbx, 4, 0, 1, 2 +GENERATE_JSIMD_RGB_GRAY_CONVERT_MIPS_DSPR2 extbgrx, 4, 2, 1, 0 +GENERATE_JSIMD_RGB_GRAY_CONVERT_MIPS_DSPR2 extxbgr, 4, 3, 2, 1 +GENERATE_JSIMD_RGB_GRAY_CONVERT_MIPS_DSPR2 extxrgb, 4, 1, 2, 3 +/*****************************************************************************/ +/* + * jsimd_h2v2_merged_upsample_mips_dspr2 + * jsimd_h2v2_extrgb_merged_upsample_mips_dspr2 + * jsimd_h2v2_extrgbx_merged_upsample_mips_dspr2 + * jsimd_h2v2_extbgr_merged_upsample_mips_dspr2 + * jsimd_h2v2_extbgrx_merged_upsample_mips_dspr2 + * jsimd_h2v2_extxbgr_merged_upsample_mips_dspr2 + * jsimd_h2v2_extxrgb_merged_upsample_mips_dspr2 + * + * Merged h2v2 upsample routines + */ +.macro GENERATE_H2V2_MERGED_UPSAMPLE_MIPS_DSPR2 colorid, \ + pixel_size, \ + r1_offs, \ + g1_offs, \ + b1_offs, \ + a1_offs, \ + r2_offs, \ + g2_offs, \ + b2_offs, \ + a2_offs + +.macro STORE_H2V2_2_PIXELS scratch0 \ + scratch1 \ + scratch2 \ + scratch3 \ + scratch4 \ + scratch5 \ + outptr + sb \scratch0, \r1_offs(\outptr) + sb \scratch1, \g1_offs(\outptr) + sb \scratch2, \b1_offs(\outptr) + sb \scratch3, \r2_offs(\outptr) + sb \scratch4, \g2_offs(\outptr) + sb \scratch5, \b2_offs(\outptr) +.if (\pixel_size == 8) + li \scratch0, 0xFF + sb \scratch0, \a1_offs(\outptr) + sb \scratch0, \a2_offs(\outptr) +.endif + addiu \outptr, \pixel_size +.endm + +.macro STORE_H2V2_1_PIXEL scratch0 \ + scratch1 \ + scratch2 \ + outptr + sb \scratch0, \r1_offs(\outptr) + sb \scratch1, \g1_offs(\outptr) + sb \scratch2, \b1_offs(\outptr) + +.if (\pixel_size == 8) + li t0, 0xFF + sb t0, \a1_offs(\outptr) +.endif +.endm + +LEAF_MIPS_DSPR2(jsimd_h2v2_\colorid\()_merged_upsample_mips_dspr2) +/* + * a0 - cinfo->output_width + * a1 - input_buf + * a2 - in_row_group_ctr + * a3 - output_buf + * 16(sp) - cinfo->sample_range_limit + */ + + SAVE_REGS_ON_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, ra + + lw t9, 56(sp) // cinfo->sample_range_limit + lw v0, 0(a1) + lw v1, 4(a1) + lw t0, 8(a1) + sll t1, a2, 3 + addiu t2, t1, 4 + sll t3, a2, 2 + lw t4, 0(a3) // t4 = output_buf[0] + lwx t1, t1(v0) // t1 = input_buf[0][in_row_group_ctr*2] + lwx t2, t2(v0) // t2 = input_buf[0][in_row_group_ctr*2 + 1] + lwx t5, t3(v1) // t5 = input_buf[1][in_row_group_ctr] + lwx t6, t3(t0) // t6 = input_buf[2][in_row_group_ctr] + lw t7, 4(a3) // t7 = output_buf[1] + li s1, 0xe6ea + addiu t8, s1, 0x7fff // t8 = 0x166e9 [FIX(1.40200)] + addiu s0, t8, 0x5eb9 // s0 = 0x1c5a2 [FIX(1.77200)] + addiu s1, zero, 0xa7e6 // s4 = 0xffffa7e6 [-FIX(0.34414)] + xori s2, s1, 0xeec8 // s3 = 0xffff492e [-FIX(0.71414)] + srl t3, a0, 1 + blez t3, 2f + addu t0, t5, t3 // t0 = end address + 1: + lbu t3, 0(t5) + lbu s3, 0(t6) + addiu t5, t5, 1 + addiu t3, t3, -128 // (cb - 128) + addiu s3, s3, -128 // (cr - 128) + mult $ac1, s1, t3 + madd $ac1, s2, s3 + sll s3, s3, 15 + sll t3, t3, 15 + mulq_rs.w s4, t8, s3 // s4 = (C1 * cr + ONE_HALF)>> SCALEBITS + extr_r.w s5, $ac1, 16 + mulq_rs.w s6, s0, t3 // s6 = (C2 * cb + ONE_HALF)>> SCALEBITS + lbu v0, 0(t1) + addiu t6, t6, 1 + addiu t1, t1, 2 + addu t3, v0, s4 // y+cred + addu s3, v0, s5 // y+cgreen + addu v1, v0, s6 // y+cblue + addu t3, t9, t3 // y+cred + addu s3, t9, s3 // y+cgreen + addu v1, t9, v1 // y+cblue + lbu AT, 0(t3) + lbu s7, 0(s3) + lbu ra, 0(v1) + lbu v0, -1(t1) + addu t3, v0, s4 // y+cred + addu s3, v0, s5 // y+cgreen + addu v1, v0, s6 // y+cblue + addu t3, t9, t3 // y+cred + addu s3, t9, s3 // y+cgreen + addu v1, t9, v1 // y+cblue + lbu t3, 0(t3) + lbu s3, 0(s3) + lbu v1, 0(v1) + lbu v0, 0(t2) + + STORE_H2V2_2_PIXELS AT, s7, ra, t3, s3, v1, t4 + + addu t3, v0, s4 // y+cred + addu s3, v0, s5 // y+cgreen + addu v1, v0, s6 // y+cblue + addu t3, t9, t3 // y+cred + addu s3, t9, s3 // y+cgreen + addu v1, t9, v1 // y+cblue + lbu AT, 0(t3) + lbu s7, 0(s3) + lbu ra, 0(v1) + lbu v0, 1(t2) + addiu t2, t2, 2 + addu t3, v0, s4 // y+cred + addu s3, v0, s5 // y+cgreen + addu v1, v0, s6 // y+cblue + addu t3, t9, t3 // y+cred + addu s3, t9, s3 // y+cgreen + addu v1, t9, v1 // y+cblue + lbu t3, 0(t3) + lbu s3, 0(s3) + lbu v1, 0(v1) + + STORE_H2V2_2_PIXELS AT, s7, ra, t3, s3, v1, t7 + + bne t0, t5, 1b + nop +2: + andi t0, a0, 1 + beqz t0, 4f + lbu t3, 0(t5) + lbu s3, 0(t6) + addiu t3, t3, -128 // (cb - 128) + addiu s3, s3, -128 // (cr - 128) + mult $ac1, s1, t3 + madd $ac1, s2, s3 + sll s3, s3, 15 + sll t3, t3, 15 + lbu v0, 0(t1) + extr_r.w s5, $ac1, 16 + mulq_rs.w s4, t8, s3 // s4 = (C1 * cr + ONE_HALF)>> SCALEBITS + mulq_rs.w s6, s0, t3 // s6 = (C2 * cb + ONE_HALF)>> SCALEBITS + addu t3, v0, s4 // y+cred + addu s3, v0, s5 // y+cgreen + addu v1, v0, s6 // y+cblue + addu t3, t9, t3 // y+cred + addu s3, t9, s3 // y+cgreen + addu v1, t9, v1 // y+cblue + lbu t3, 0(t3) + lbu s3, 0(s3) + lbu v1, 0(v1) + lbu v0, 0(t2) + + STORE_H2V2_1_PIXEL t3, s3, v1, t4 + + addu t3, v0, s4 // y+cred + addu s3, v0, s5 // y+cgreen + addu v1, v0, s6 // y+cblue + addu t3, t9, t3 // y+cred + addu s3, t9, s3 // y+cgreen + addu v1, t9, v1 // y+cblue + lbu t3, 0(t3) + lbu s3, 0(s3) + lbu v1, 0(v1) + + STORE_H2V2_1_PIXEL t3, s3, v1, t7 +4: + RESTORE_REGS_FROM_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, ra + + j ra + nop + +END(jsimd_h2v2_\colorid\()_merged_upsample_mips_dspr2) + +.purgem STORE_H2V2_1_PIXEL +.purgem STORE_H2V2_2_PIXELS +.endm + +/*-----------------------------------------id -- pix R1 G1 B1 A1 R2 G2 B2 A2 */ +GENERATE_H2V2_MERGED_UPSAMPLE_MIPS_DSPR2 extrgb, 6, 0, 1, 2, 6, 3, 4, 5, 6 +GENERATE_H2V2_MERGED_UPSAMPLE_MIPS_DSPR2 extbgr, 6, 2, 1, 0, 3, 5, 4, 3, 6 +GENERATE_H2V2_MERGED_UPSAMPLE_MIPS_DSPR2 extrgbx, 8, 0, 1, 2, 3, 4, 5, 6, 7 +GENERATE_H2V2_MERGED_UPSAMPLE_MIPS_DSPR2 extbgrx, 8, 2, 1, 0, 3, 6, 5, 4, 7 +GENERATE_H2V2_MERGED_UPSAMPLE_MIPS_DSPR2 extxbgr, 8, 3, 2, 1, 0, 7, 6, 5, 4 +GENERATE_H2V2_MERGED_UPSAMPLE_MIPS_DSPR2 extxrgb, 8, 1, 2, 3, 0, 5, 6, 7, 4 +/*****************************************************************************/ +/* + * jsimd_h2v1_merged_upsample_mips_dspr2 + * jsimd_h2v1_extrgb_merged_upsample_mips_dspr2 + * jsimd_h2v1_extrgbx_merged_upsample_mips_dspr2 + * jsimd_h2v1_extbgr_merged_upsample_mips_dspr2 + * jsimd_h2v1_extbgrx_merged_upsample_mips_dspr2 + * jsimd_h2v1_extxbgr_merged_upsample_mips_dspr2 + * jsimd_h2v1_extxrgb_merged_upsample_mips_dspr2 + * + * Merged h2v1 upsample routines + */ + +.macro GENERATE_H2V1_MERGED_UPSAMPLE_MIPS_DSPR2 colorid, \ + pixel_size, \ + r1_offs, \ + g1_offs, \ + b1_offs, \ + a1_offs, \ + r2_offs, \ + g2_offs, \ + b2_offs, \ + a2_offs + +.macro STORE_H2V1_2_PIXELS scratch0 \ + scratch1 \ + scratch2 \ + scratch3 \ + scratch4 \ + scratch5 \ + outptr + sb \scratch0, \r1_offs(\outptr) + sb \scratch1, \g1_offs(\outptr) + sb \scratch2, \b1_offs(\outptr) + sb \scratch3, \r2_offs(\outptr) + sb \scratch4, \g2_offs(\outptr) + sb \scratch5, \b2_offs(\outptr) +.if (\pixel_size == 8) + li t0, 0xFF + sb t0, \a1_offs(\outptr) + sb t0, \a2_offs(\outptr) +.endif + addiu \outptr, \pixel_size +.endm + +.macro STORE_H2V1_1_PIXEL scratch0 \ + scratch1 \ + scratch2 \ + outptr + sb \scratch0, \r1_offs(\outptr) + sb \scratch1, \g1_offs(\outptr) + sb \scratch2, \b1_offs(\outptr) +.if (\pixel_size == 8) + li t0, 0xFF + sb t0, \a1_offs(\outptr) +.endif +.endm + +LEAF_MIPS_DSPR2(jsimd_h2v1_\colorid\()_merged_upsample_mips_dspr2) +/* + * a0 - cinfo->output_width + * a1 - input_buf + * a2 - in_row_group_ctr + * a3 - output_buf + * 16(sp) - range_limit + */ + + SAVE_REGS_ON_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, ra + + li t0, 0xe6ea + lw t1, 0(a1) // t1 = input_buf[0] + lw t2, 4(a1) // t2 = input_buf[1] + lw t3, 8(a1) // t3 = input_buf[2] + lw t8, 56(sp) // t8 = range_limit + addiu s1, t0, 0x7fff // s1 = 0x166e9 [FIX(1.40200)] + addiu s2, s1, 0x5eb9 // s2 = 0x1c5a2 [FIX(1.77200)] + addiu s0, t0, 0x9916 // s0 = 0x8000 + addiu s4, zero, 0xa7e6 // s4 = 0xffffa7e6 [-FIX(0.34414)] + xori s3, s4, 0xeec8 // s3 = 0xffff492e [-FIX(0.71414)] + srl t0, a0, 1 + sll t4, a2, 2 + lwx s5, t4(t1) // s5 = inptr0 + lwx s6, t4(t2) // s6 = inptr1 + lwx s7, t4(t3) // s7 = inptr2 + lw t7, 0(a3) // t7 = outptr + blez t0, 2f + addu t9, s6, t0 // t9 = end address +1: + lbu t2, 0(s6) // t2 = cb + lbu t0, 0(s7) // t0 = cr + lbu t1, 0(s5) // t1 = y + addiu t2, t2, -128 // t2 = cb - 128 + addiu t0, t0, -128 // t0 = cr - 128 + mult $ac1, s4, t2 + madd $ac1, s3, t0 + sll t0, t0, 15 + sll t2, t2, 15 + mulq_rs.w t0, s1, t0 // t0 = (C1*cr + ONE_HALF)>> SCALEBITS + extr_r.w t5, $ac1, 16 + mulq_rs.w t6, s2, t2 // t6 = (C2*cb + ONE_HALF)>> SCALEBITS + addiu s7, s7, 1 + addiu s6, s6, 1 + addu t2, t1, t0 // t2 = y + cred + addu t3, t1, t5 // t3 = y + cgreen + addu t4, t1, t6 // t4 = y + cblue + addu t2, t8, t2 + addu t3, t8, t3 + addu t4, t8, t4 + lbu t1, 1(s5) + lbu v0, 0(t2) + lbu v1, 0(t3) + lbu ra, 0(t4) + addu t2, t1, t0 + addu t3, t1, t5 + addu t4, t1, t6 + addu t2, t8, t2 + addu t3, t8, t3 + addu t4, t8, t4 + lbu t2, 0(t2) + lbu t3, 0(t3) + lbu t4, 0(t4) + + STORE_H2V1_2_PIXELS v0, v1, ra, t2, t3, t4, t7 + + bne t9, s6, 1b + addiu s5, s5, 2 +2: + andi t0, a0, 1 + beqz t0, 4f + nop +3: + lbu t2, 0(s6) + lbu t0, 0(s7) + lbu t1, 0(s5) + addiu t2, t2, -128 //(cb - 128) + addiu t0, t0, -128 //(cr - 128) + mul t3, s4, t2 + mul t4, s3, t0 + sll t0, t0, 15 + sll t2, t2, 15 + mulq_rs.w t0, s1, t0 // (C1*cr + ONE_HALF)>> SCALEBITS + mulq_rs.w t6, s2, t2 // (C2*cb + ONE_HALF)>> SCALEBITS + addu t3, t3, s0 + addu t3, t4, t3 + sra t5, t3, 16 // (C4*cb + ONE_HALF + C3*cr)>> SCALEBITS + addu t2, t1, t0 // y + cred + addu t3, t1, t5 // y + cgreen + addu t4, t1, t6 // y + cblue + addu t2, t8, t2 + addu t3, t8, t3 + addu t4, t8, t4 + lbu t2, 0(t2) + lbu t3, 0(t3) + lbu t4, 0(t4) + + STORE_H2V1_1_PIXEL t2, t3, t4, t7 +4: + RESTORE_REGS_FROM_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, ra + + j ra + nop + +END(jsimd_h2v1_\colorid\()_merged_upsample_mips_dspr2) + +.purgem STORE_H2V1_1_PIXEL +.purgem STORE_H2V1_2_PIXELS +.endm + +/*-----------------------------------------id -- pix R1 G1 B1 A1 R2 G2 B2 A2 */ +GENERATE_H2V1_MERGED_UPSAMPLE_MIPS_DSPR2 extrgb, 6, 0, 1, 2, 6, 3, 4, 5, 6 +GENERATE_H2V1_MERGED_UPSAMPLE_MIPS_DSPR2 extbgr, 6, 2, 1, 0, 3, 5, 4, 3, 6 +GENERATE_H2V1_MERGED_UPSAMPLE_MIPS_DSPR2 extrgbx, 8, 0, 1, 2, 3, 4, 5, 6, 7 +GENERATE_H2V1_MERGED_UPSAMPLE_MIPS_DSPR2 extbgrx, 8, 2, 1, 0, 3, 6, 5, 4, 7 +GENERATE_H2V1_MERGED_UPSAMPLE_MIPS_DSPR2 extxbgr, 8, 3, 2, 1, 0, 7, 6, 5, 4 +GENERATE_H2V1_MERGED_UPSAMPLE_MIPS_DSPR2 extxrgb, 8, 1, 2, 3, 0, 5, 6, 7, 4 +/*****************************************************************************/ +/* + * jsimd_h2v2_fancy_upsample_mips_dspr2 + * + * Fancy processing for the common case of 2:1 horizontal and 2:1 vertical. + */ +LEAF_MIPS_DSPR2(jsimd_h2v2_fancy_upsample_mips_dspr2) +/* + * a0 - cinfo->max_v_samp_factor + * a1 - downsampled_width + * a2 - input_data + * a3 - output_data_ptr + */ + + SAVE_REGS_ON_STACK 24, s0, s1, s2, s3, s4, s5 + + li s4, 0 + lw s2, 0(a3) // s2 = *output_data_ptr +0: + li t9, 2 + lw s1, -4(a2) // s1 = inptr1 + +1: + lw s0, 0(a2) // s0 = inptr0 + lwx s3, s4(s2) + addiu s5, a1, -2 // s5 = downsampled_width - 2 + srl t4, s5, 1 + sll t4, t4, 1 + lbu t0, 0(s0) + lbu t1, 1(s0) + lbu t2, 0(s1) + lbu t3, 1(s1) + addiu s0, 2 + addiu s1, 2 + addu t8, s0, t4 // t8 = end address + andi s5, s5, 1 // s5 = residual + sll t4, t0, 1 + sll t6, t1, 1 + addu t0, t0, t4 // t0 = (*inptr0++) * 3 + addu t1, t1, t6 // t1 = (*inptr0++) * 3 + addu t7, t0, t2 // t7 = thiscolsum + addu t6, t1, t3 // t5 = nextcolsum + sll t0, t7, 2 // t0 = thiscolsum * 4 + subu t1, t0, t7 // t1 = thiscolsum * 3 + shra_r.w t0, t0, 4 + addiu t1, 7 + addu t1, t1, t6 + srl t1, t1, 4 + sb t0, 0(s3) + sb t1, 1(s3) + beq t8, s0, 22f // skip to final iteration if width == 3 + addiu s3, 2 +2: + lh t0, 0(s0) // t0 = A3|A2 + lh t2, 0(s1) // t2 = B3|B2 + addiu s0, 2 + addiu s1, 2 + preceu.ph.qbr t0, t0 // t0 = 0|A3|0|A2 + preceu.ph.qbr t2, t2 // t2 = 0|B3|0|B2 + shll.ph t1, t0, 1 + sll t3, t6, 1 + addu.ph t0, t1, t0 // t0 = A3*3|A2*3 + addu t3, t3, t6 // t3 = this * 3 + addu.ph t0, t0, t2 // t0 = next2|next1 + addu t1, t3, t7 + andi t7, t0, 0xFFFF // t7 = next1 + sll t2, t7, 1 + addu t2, t7, t2 // t2 = next1*3 + addu t4, t2, t6 + srl t6, t0, 16 // t6 = next2 + shra_r.w t1, t1, 4 // t1 = (this*3 + last + 8) >> 4 + addu t0, t3, t7 + addiu t0, 7 + srl t0, t0, 4 // t0 = (this*3 + next1 + 7) >> 4 + shra_r.w t4, t4, 4 // t3 = (next1*3 + this + 8) >> 4 + addu t2, t2, t6 + addiu t2, 7 + srl t2, t2, 4 // t2 = (next1*3 + next2 + 7) >> 4 + sb t1, 0(s3) + sb t0, 1(s3) + sb t4, 2(s3) + sb t2, 3(s3) + bne t8, s0, 2b + addiu s3, 4 +22: + beqz s5, 4f + addu t8, s0, s5 +3: + lbu t0, 0(s0) + lbu t2, 0(s1) + addiu s0, 1 + addiu s1, 1 + sll t3, t6, 1 + sll t1, t0, 1 + addu t1, t0, t1 // t1 = inptr0 * 3 + addu t3, t3, t6 // t3 = thiscolsum * 3 + addu t5, t1, t2 + addu t1, t3, t7 + shra_r.w t1, t1, 4 + addu t0, t3, t5 + addiu t0, 7 + srl t0, t0, 4 + sb t1, 0(s3) + sb t0, 1(s3) + addiu s3, 2 + move t7, t6 + bne t8, s0, 3b + move t6, t5 +4: + sll t0, t6, 2 // t0 = thiscolsum * 4 + subu t1, t0, t6 // t1 = thiscolsum * 3 + addu t1, t1, t7 + addiu s4, 4 + shra_r.w t1, t1, 4 + addiu t0, 7 + srl t0, t0, 4 + sb t1, 0(s3) + sb t0, 1(s3) + addiu t9, -1 + addiu s3, 2 + bnez t9, 1b + lw s1, 4(a2) + srl t0, s4, 2 + subu t0, a0, t0 + bgtz t0, 0b + addiu a2, 4 + + RESTORE_REGS_FROM_STACK 24, s0, s1, s2, s3, s4, s5 + + j ra + nop +END(jsimd_h2v2_fancy_upsample_mips_dspr2) + +/*****************************************************************************/ +LEAF_MIPS_DSPR2(jsimd_h2v1_fancy_upsample_mips_dspr2) +/* + * a0 - cinfo->max_v_samp_factor + * a1 - downsampled_width + * a2 - input_data + * a3 - output_data_ptr + */ + + SAVE_REGS_ON_STACK 16, s0, s1, s2, s3 + + .set at + + beqz a0, 3f + sll t0, a0, 2 + lw s1, 0(a3) + li s3, 0x10001 + addu s0, s1, t0 +0: + addiu t8, a1, -2 + srl t9, t8, 2 + lw t7, 0(a2) + lw s2, 0(s1) + lbu t0, 0(t7) + lbu t1, 1(t7) // t1 = inptr[1] + sll t2, t0, 1 + addu t2, t2, t0 // t2 = invalue*3 + addu t2, t2, t1 + shra_r.w t2, t2, 2 + sb t0, 0(s2) + sb t2, 1(s2) + beqz t9, 11f + addiu s2, 2 +1: + ulw t0, 0(t7) // t0 = |P3|P2|P1|P0| + ulw t1, 1(t7) + ulh t2, 4(t7) // t2 = |0|0|P5|P4| + preceu.ph.qbl t3, t0 // t3 = |0|P3|0|P2| + preceu.ph.qbr t0, t0 // t0 = |0|P1|0|P0| + preceu.ph.qbr t2, t2 // t2 = |0|P5|0|P4| + preceu.ph.qbl t4, t1 // t4 = |0|P4|0|P3| + preceu.ph.qbr t1, t1 // t1 = |0|P2|0|P1| + shll.ph t5, t4, 1 + shll.ph t6, t1, 1 + addu.ph t5, t5, t4 // t5 = |P4*3|P3*3| + addu.ph t6, t6, t1 // t6 = |P2*3|P1*3| + addu.ph t4, t3, s3 + addu.ph t0, t0, s3 + addu.ph t4, t4, t5 + addu.ph t0, t0, t6 + shrl.ph t4, t4, 2 // t4 = |0|P3|0|P2| + shrl.ph t0, t0, 2 // t0 = |0|P1|0|P0| + addu.ph t2, t2, t5 + addu.ph t3, t3, t6 + shra_r.ph t2, t2, 2 // t2 = |0|P5|0|P4| + shra_r.ph t3, t3, 2 // t3 = |0|P3|0|P2| + shll.ph t2, t2, 8 + shll.ph t3, t3, 8 + or t2, t4, t2 + or t3, t3, t0 + addiu t9, -1 + usw t3, 0(s2) + usw t2, 4(s2) + addiu s2, 8 + bgtz t9, 1b + addiu t7, 4 +11: + andi t8, 3 + beqz t8, 22f + addiu t7, 1 + +2: + lbu t0, 0(t7) + addiu t7, 1 + sll t1, t0, 1 + addu t2, t0, t1 // t2 = invalue + lbu t3, -2(t7) + lbu t4, 0(t7) + addiu t3, 1 + addiu t4, 2 + addu t3, t3, t2 + addu t4, t4, t2 + srl t3, 2 + srl t4, 2 + sb t3, 0(s2) + sb t4, 1(s2) + addiu t8, -1 + bgtz t8, 2b + addiu s2, 2 + +22: + lbu t0, 0(t7) + lbu t2, -1(t7) + sll t1, t0, 1 + addu t1, t1, t0 // t1 = invalue * 3 + addu t1, t1, t2 + addiu t1, 1 + srl t1, t1, 2 + sb t1, 0(s2) + sb t0, 1(s2) + addiu s1, 4 + bne s1, s0, 0b + addiu a2, 4 +3: + RESTORE_REGS_FROM_STACK 16, s0, s1, s2, s3 + + j ra + nop +END(jsimd_h2v1_fancy_upsample_mips_dspr2) + +/*****************************************************************************/ +LEAF_MIPS_DSPR2(jsimd_h2v1_downsample_mips_dspr2) +/* + * a0 - cinfo->image_width + * a1 - cinfo->max_v_samp_factor + * a2 - compptr->v_samp_factor + * a3 - compptr->width_in_blocks + * 16(sp) - input_data + * 20(sp) - output_data + */ + .set at + + SAVE_REGS_ON_STACK 24, s0, s1, s2, s3, s4 + + beqz a2, 7f + lw s1, 44(sp) // s1 = output_data + lw s0, 40(sp) // s0 = input_data + srl s2, a0, 2 + andi t9, a0, 2 + srl t7, t9, 1 + addu s2, t7, s2 + sll t0, a3, 3 // t0 = width_in_blocks*DCT + srl t7, t0, 1 + subu s2, t7, s2 +0: + andi t6, a0, 1 // t6 = temp_index + addiu t6, -1 + lw t4, 0(s1) // t4 = outptr + lw t5, 0(s0) // t5 = inptr0 + li s3, 0 // s3 = bias + srl t7, a0, 1 // t7 = image_width1 + srl s4, t7, 2 + andi t8, t7, 3 +1: + ulhu t0, 0(t5) + ulhu t1, 2(t5) + ulhu t2, 4(t5) + ulhu t3, 6(t5) + raddu.w.qb t0, t0 + raddu.w.qb t1, t1 + raddu.w.qb t2, t2 + raddu.w.qb t3, t3 + shra.ph t0, t0, 1 + shra_r.ph t1, t1, 1 + shra.ph t2, t2, 1 + shra_r.ph t3, t3, 1 + sb t0, 0(t4) + sb t1, 1(t4) + sb t2, 2(t4) + sb t3, 3(t4) + addiu s4, -1 + addiu t4, 4 + bgtz s4, 1b + addiu t5, 8 + beqz t8, 3f + addu s4, t4, t8 +2: + ulhu t0, 0(t5) + raddu.w.qb t0, t0 + addqh.w t0, t0, s3 + xori s3, s3, 1 + sb t0, 0(t4) + addiu t4, 1 + bne t4, s4, 2b + addiu t5, 2 +3: + lbux t1, t6(t5) + sll t1, 1 + addqh.w t2, t1, s3 // t2 = pixval1 + xori s3, s3, 1 + addqh.w t3, t1, s3 // t3 = pixval2 + blez s2, 5f + append t3, t2, 8 + addu t5, t4, s2 // t5 = loop_end2 +4: + ush t3, 0(t4) + addiu s2, -1 + bgtz s2, 4b + addiu t4, 2 +5: + beqz t9, 6f + nop + sb t2, 0(t4) +6: + addiu s1, 4 + addiu a2, -1 + bnez a2, 0b + addiu s0, 4 +7: + RESTORE_REGS_FROM_STACK 24, s0, s1, s2, s3, s4 + + j ra + nop +END(jsimd_h2v1_downsample_mips_dspr2) + +/*****************************************************************************/ +LEAF_MIPS_DSPR2(jsimd_h2v2_downsample_mips_dspr2) + +/* + * a0 - cinfo->image_width + * a1 - cinfo->max_v_samp_factor + * a2 - compptr->v_samp_factor + * a3 - compptr->width_in_blocks + * 16(sp) - input_data + * 20(sp) - output_data + */ + .set at + SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7 + + beqz a2, 8f + lw s1, 52(sp) // s1 = output_data + lw s0, 48(sp) // s0 = input_data + + andi t6, a0, 1 // t6 = temp_index + addiu t6, -1 + srl t7, a0, 1 // t7 = image_width1 + srl s4, t7, 2 + andi t8, t7, 3 + andi t9, a0, 2 + srl s2, a0, 2 + srl t7, t9, 1 + addu s2, t7, s2 + sll t0, a3, 3 // s2 = width_in_blocks*DCT + srl t7, t0, 1 + subu s2, t7, s2 +0: + lw t4, 0(s1) // t4 = outptr + lw t5, 0(s0) // t5 = inptr0 + lw s7, 4(s0) // s7 = inptr1 + li s6, 1 // s6 = bias +2: + ulw t0, 0(t5) // t0 = |P3|P2|P1|P0| + ulw t1, 0(s7) // t1 = |Q3|Q2|Q1|Q0| + ulw t2, 4(t5) + ulw t3, 4(s7) + precrq.ph.w t7, t0, t1 // t2 = |P3|P2|Q3|Q2| + ins t0, t1, 16, 16 // t0 = |Q1|Q0|P1|P0| + raddu.w.qb t1, t7 + raddu.w.qb t0, t0 + shra_r.w t1, t1, 2 + addiu t0, 1 + srl t0, 2 + precrq.ph.w t7, t2, t3 + ins t2, t3, 16, 16 + raddu.w.qb t7, t7 + raddu.w.qb t2, t2 + shra_r.w t7, t7, 2 + addiu t2, 1 + srl t2, 2 + sb t0, 0(t4) + sb t1, 1(t4) + sb t2, 2(t4) + sb t7, 3(t4) + addiu t4, 4 + addiu t5, 8 + addiu s4, s4, -1 + bgtz s4, 2b + addiu s7, 8 + beqz t8, 4f + addu t8, t4, t8 +3: + ulhu t0, 0(t5) + ulhu t1, 0(s7) + ins t0, t1, 16, 16 + raddu.w.qb t0, t0 + addu t0, t0, s6 + srl t0, 2 + xori s6, s6, 3 + sb t0, 0(t4) + addiu t5, 2 + addiu t4, 1 + bne t8, t4, 3b + addiu s7, 2 +4: + lbux t1, t6(t5) + sll t1, 1 + lbux t0, t6(s7) + sll t0, 1 + addu t1, t1, t0 + addu t3, t1, s6 + srl t0, t3, 2 // t2 = pixval1 + xori s6, s6, 3 + addu t2, t1, s6 + srl t1, t2, 2 // t3 = pixval2 + blez s2, 6f + append t1, t0, 8 +5: + ush t1, 0(t4) + addiu s2, -1 + bgtz s2, 5b + addiu t4, 2 +6: + beqz t9, 7f + nop + sb t0, 0(t4) +7: + addiu s1, 4 + addiu a2, -1 + bnez a2, 0b + addiu s0, 8 +8: + RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7 + + j ra + nop +END(jsimd_h2v2_downsample_mips_dspr2) +/*****************************************************************************/ +LEAF_MIPS_DSPR2(jsimd_h2v2_smooth_downsample_mips_dspr2) +/* + * a0 - input_data + * a1 - output_data + * a2 - compptr->v_samp_factor + * a3 - cinfo->max_v_samp_factor + * 16(sp) - cinfo->smoothing_factor + * 20(sp) - compptr->width_in_blocks + * 24(sp) - cinfo->image_width + */ + + .set at + + SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7 + + lw s7, 52(sp) // compptr->width_in_blocks + lw s0, 56(sp) // cinfo->image_width + lw s6, 48(sp) // cinfo->smoothing_factor + sll s7, 3 // output_cols = width_in_blocks * DCTSIZE + sll v0, s7, 1 + subu v0, v0, s0 + blez v0, 2f + move v1, zero + addiu t0, a3, 2 // t0 = cinfo->max_v_samp_factor + 2 +0: + addiu t1, a0, -4 + sll t2, v1, 2 + lwx t1, t2(t1) + move t3, v0 + addu t1, t1, s0 + lbu t2, -1(t1) +1: + addiu t3, t3, -1 + sb t2, 0(t1) + bgtz t3, 1b + addiu t1, t1, 1 + addiu v1, v1, 1 + bne v1, t0, 0b + nop +2: + li v0, 80 + mul v0, s6, v0 + li v1, 16384 + move t4, zero + move t5, zero + subu t6, v1, v0 // t6 = 16384 - tmp_smoot_f * 80 + sll t7, s6, 4 // t7 = tmp_smoot_f * 16 +3: +/* Special case for first column: pretend column -1 is same as column 0 */ + sll v0, t4, 2 + lwx t8, v0(a1) // outptr = output_data[outrow] + sll v1, t5, 2 + addiu t9, v1, 4 + addiu s0, v1, -4 + addiu s1, v1, 8 + lwx s2, v1(a0) // inptr0 = input_data[inrow] + lwx t9, t9(a0) // inptr1 = input_data[inrow+1] + lwx s0, s0(a0) // above_ptr = input_data[inrow-1] + lwx s1, s1(a0) // below_ptr = input_data[inrow+2] + lh v0, 0(s2) + lh v1, 0(t9) + lh t0, 0(s0) + lh t1, 0(s1) + ins v0, v1, 16, 16 + ins t0, t1, 16, 16 + raddu.w.qb t2, v0 + raddu.w.qb s3, t0 + lbu v0, 0(s2) + lbu v1, 2(s2) + lbu t0, 0(t9) + lbu t1, 2(t9) + addu v0, v0, v1 + mult $ac1,t2, t6 + addu t0, t0, t1 + lbu t2, 2(s0) + addu t0, t0, v0 + lbu t3, 2(s1) + addu s3, t0, s3 + lbu v0, 0(s0) + lbu t0, 0(s1) + sll s3, s3, 1 + addu v0, v0, t2 + addu t0, t0, t3 + addu t0, t0, v0 + addu s3, t0, s3 + madd $ac1,s3, t7 + extr_r.w v0, $ac1, 16 + addiu t8, t8, 1 + addiu s2, s2, 2 + addiu t9, t9, 2 + addiu s0, s0, 2 + addiu s1, s1, 2 + sb v0, -1(t8) + addiu s4, s7, -2 + and s4, s4, 3 + addu s5, s4, t8 //end adress +4: + lh v0, 0(s2) + lh v1, 0(t9) + lh t0, 0(s0) + lh t1, 0(s1) + ins v0, v1, 16, 16 + ins t0, t1, 16, 16 + raddu.w.qb t2, v0 + raddu.w.qb s3, t0 + lbu v0, -1(s2) + lbu v1, 2(s2) + lbu t0, -1(t9) + lbu t1, 2(t9) + addu v0, v0, v1 + mult $ac1, t2, t6 + addu t0, t0, t1 + lbu t2, 2(s0) + addu t0, t0, v0 + lbu t3, 2(s1) + addu s3, t0, s3 + lbu v0, -1(s0) + lbu t0, -1(s1) + sll s3, s3, 1 + addu v0, v0, t2 + addu t0, t0, t3 + addu t0, t0, v0 + addu s3, t0, s3 + madd $ac1, s3, t7 + extr_r.w t2, $ac1, 16 + addiu t8, t8, 1 + addiu s2, s2, 2 + addiu t9, t9, 2 + addiu s0, s0, 2 + sb t2, -1(t8) + bne s5, t8, 4b + addiu s1, s1, 2 + addiu s5, s7, -2 + subu s5, s5, s4 + addu s5, s5, t8 //end adress +5: + lh v0, 0(s2) + lh v1, 0(t9) + lh t0, 0(s0) + lh t1, 0(s1) + ins v0, v1, 16, 16 + ins t0, t1, 16, 16 + raddu.w.qb t2, v0 + raddu.w.qb s3, t0 + lbu v0, -1(s2) + lbu v1, 2(s2) + lbu t0, -1(t9) + lbu t1, 2(t9) + addu v0, v0, v1 + mult $ac1, t2, t6 + addu t0, t0, t1 + lbu t2, 2(s0) + addu t0, t0, v0 + lbu t3, 2(s1) + addu s3, t0, s3 + lbu v0, -1(s0) + lbu t0, -1(s1) + sll s3, s3, 1 + addu v0, v0, t2 + addu t0, t0, t3 + lh v1, 2(t9) + addu t0, t0, v0 + lh v0, 2(s2) + addu s3, t0, s3 + lh t0, 2(s0) + lh t1, 2(s1) + madd $ac1, s3, t7 + extr_r.w t2, $ac1, 16 + ins t0, t1, 16, 16 + ins v0, v1, 16, 16 + raddu.w.qb s3, t0 + lbu v1, 4(s2) + lbu t0, 1(t9) + lbu t1, 4(t9) + sb t2, 0(t8) + raddu.w.qb t3, v0 + lbu v0, 1(s2) + addu t0, t0, t1 + mult $ac1, t3, t6 + addu v0, v0, v1 + lbu t2, 4(s0) + addu t0, t0, v0 + lbu v0, 1(s0) + addu s3, t0, s3 + lbu t0, 1(s1) + lbu t3, 4(s1) + addu v0, v0, t2 + sll s3, s3, 1 + addu t0, t0, t3 + lh v1, 4(t9) + addu t0, t0, v0 + lh v0, 4(s2) + addu s3, t0, s3 + lh t0, 4(s0) + lh t1, 4(s1) + madd $ac1, s3, t7 + extr_r.w t2, $ac1, 16 + ins t0, t1, 16, 16 + ins v0, v1, 16, 16 + raddu.w.qb s3, t0 + lbu v1, 6(s2) + lbu t0, 3(t9) + lbu t1, 6(t9) + sb t2, 1(t8) + raddu.w.qb t3, v0 + lbu v0, 3(s2) + addu t0, t0,t1 + mult $ac1, t3, t6 + addu v0, v0, v1 + lbu t2, 6(s0) + addu t0, t0, v0 + lbu v0, 3(s0) + addu s3, t0, s3 + lbu t0, 3(s1) + lbu t3, 6(s1) + addu v0, v0, t2 + sll s3, s3, 1 + addu t0, t0, t3 + lh v1, 6(t9) + addu t0, t0, v0 + lh v0, 6(s2) + addu s3, t0, s3 + lh t0, 6(s0) + lh t1, 6(s1) + madd $ac1, s3, t7 + extr_r.w t3, $ac1, 16 + ins t0, t1, 16, 16 + ins v0, v1, 16, 16 + raddu.w.qb s3, t0 + lbu v1, 8(s2) + lbu t0, 5(t9) + lbu t1, 8(t9) + sb t3, 2(t8) + raddu.w.qb t2, v0 + lbu v0, 5(s2) + addu t0, t0, t1 + mult $ac1, t2, t6 + addu v0, v0, v1 + lbu t2, 8(s0) + addu t0, t0, v0 + lbu v0, 5(s0) + addu s3, t0, s3 + lbu t0, 5(s1) + lbu t3, 8(s1) + addu v0, v0, t2 + sll s3, s3, 1 + addu t0, t0, t3 + addiu t8, t8, 4 + addu t0, t0, v0 + addiu s2, s2, 8 + addu s3, t0, s3 + addiu t9, t9, 8 + madd $ac1, s3, t7 + extr_r.w t1, $ac1, 16 + addiu s0, s0, 8 + addiu s1, s1, 8 + bne s5, t8, 5b + sb t1, -1(t8) +/* Special case for last column */ + lh v0, 0(s2) + lh v1, 0(t9) + lh t0, 0(s0) + lh t1, 0(s1) + ins v0, v1, 16, 16 + ins t0, t1, 16, 16 + raddu.w.qb t2, v0 + raddu.w.qb s3, t0 + lbu v0, -1(s2) + lbu v1, 1(s2) + lbu t0, -1(t9) + lbu t1, 1(t9) + addu v0, v0, v1 + mult $ac1, t2, t6 + addu t0, t0, t1 + lbu t2, 1(s0) + addu t0, t0, v0 + lbu t3, 1(s1) + addu s3, t0, s3 + lbu v0, -1(s0) + lbu t0, -1(s1) + sll s3, s3, 1 + addu v0, v0, t2 + addu t0, t0, t3 + addu t0, t0, v0 + addu s3, t0, s3 + madd $ac1, s3, t7 + extr_r.w t0, $ac1, 16 + addiu t5, t5, 2 + sb t0, 0(t8) + addiu t4, t4, 1 + bne t4, a2, 3b + addiu t5, t5, 2 + + RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7 + + j ra + nop + +END(jsimd_h2v2_smooth_downsample_mips_dspr2) + +/*****************************************************************************/ +LEAF_MIPS_DSPR2(jsimd_int_upsample_mips_dspr2) +/* + * a0 - upsample->h_expand[compptr->component_index] + * a1 - upsample->v_expand[compptr->component_index] + * a2 - input_data + * a3 - output_data_ptr + * 16(sp) - cinfo->output_width + * 20(sp) - cinfo->max_v_samp_factor + */ + .set at + + SAVE_REGS_ON_STACK 16, s0, s1, s2, s3 + + lw s0, 0(a3) // s0 = output_data + lw s1, 32(sp) // s1 = cinfo->output_width + lw s2, 36(sp) // s2 = cinfo->max_v_samp_factor + li t6, 0 // t6 = inrow + beqz s2, 10f + li s3, 0 // s3 = outrow +0: + addu t0, a2, t6 + addu t7, s0, s3 + lw t3, 0(t0) // t3 = inptr + lw t8, 0(t7) // t8 = outptr + beqz s1, 4f + addu t5, t8, s1 // t5 = outend +1: + lb t2, 0(t3) // t2 = invalue = *inptr++ + addiu t3, 1 + beqz a0, 3f + move t0, a0 // t0 = h_expand +2: + sb t2, 0(t8) + addiu t0, -1 + bgtz t0, 2b + addiu t8, 1 +3: + bgt t5, t8, 1b + nop +4: + addiu t9, a1, -1 // t9 = v_expand - 1 + blez t9, 9f + nop +5: + lw t3, 0(s0) + lw t4, 4(s0) + subu t0, s1, 0xF + blez t0, 7f + addu t5, t3, s1 // t5 = end address + andi t7, s1, 0xF // t7 = residual + subu t8, t5, t7 +6: + ulw t0, 0(t3) + ulw t1, 4(t3) + ulw t2, 8(t3) + usw t0, 0(t4) + ulw t0, 12(t3) + usw t1, 4(t4) + usw t2, 8(t4) + usw t0, 12(t4) + addiu t3, 16 + bne t3, t8, 6b + addiu t4, 16 + beqz t7, 8f + nop +7: + lbu t0, 0(t3) + sb t0, 0(t4) + addiu t3, 1 + bne t3, t5, 7b + addiu t4, 1 +8: + addiu t9, -1 + bgtz t9, 5b + addiu s0, 8 +9: + addu s3, s3, a1 + bne s3, s2, 0b + addiu t6, 1 +10: + RESTORE_REGS_FROM_STACK 16, s0, s1, s2, s3 + + j ra + nop +END(jsimd_int_upsample_mips_dspr2) + +/*****************************************************************************/ +LEAF_MIPS_DSPR2(jsimd_h2v1_upsample_mips_dspr2) +/* + * a0 - cinfo->max_v_samp_factor + * a1 - cinfo->output_width + * a2 - input_data + * a3 - output_data_ptr + */ + lw t7, 0(a3) // t7 = output_data + andi t8, a1, 0xf // t8 = residual + sll t0, a0, 2 + blez a0, 4f + addu t9, t7, t0 // t9 = output_data end address +0: + lw t5, 0(t7) // t5 = outptr + lw t6, 0(a2) // t6 = inptr + addu t3, t5, a1 // t3 = outptr + output_width (end address) + subu t3, t8 // t3 = end address - residual + beq t5, t3, 2f + move t4, t8 +1: + ulw t0, 0(t6) // t0 = |P3|P2|P1|P0| + ulw t2, 4(t6) // t2 = |P7|P6|P5|P4| + srl t1, t0, 16 // t1 = |X|X|P3|P2| + ins t0, t0, 16, 16 // t0 = |P1|P0|P1|P0| + ins t1, t1, 16, 16 // t1 = |P3|P2|P3|P2| + ins t0, t0, 8, 16 // t0 = |P1|P1|P0|P0| + ins t1, t1, 8, 16 // t1 = |P3|P3|P2|P2| + usw t0, 0(t5) + usw t1, 4(t5) + srl t0, t2, 16 // t0 = |X|X|P7|P6| + ins t2, t2, 16, 16 // t2 = |P5|P4|P5|P4| + ins t0, t0, 16, 16 // t0 = |P7|P6|P7|P6| + ins t2, t2, 8, 16 // t2 = |P5|P5|P4|P4| + ins t0, t0, 8, 16 // t0 = |P7|P7|P6|P6| + usw t2, 8(t5) + usw t0, 12(t5) + addiu t5, 16 + bne t5, t3, 1b + addiu t6, 8 + beqz t8, 3f + move t4, t8 +2: + lbu t1, 0(t6) + sb t1, 0(t5) + sb t1, 1(t5) + addiu t4, -2 + addiu t6, 1 + bgtz t4, 2b + addiu t5, 2 +3: + addiu t7, 4 + bne t9, t7, 0b + addiu a2, 4 +4: + j ra + nop +END(jsimd_h2v1_upsample_mips_dspr2) + +/*****************************************************************************/ +LEAF_MIPS_DSPR2(jsimd_h2v2_upsample_mips_dspr2) +/* + * a0 - cinfo->max_v_samp_factor + * a1 - cinfo->output_width + * a2 - input_data + * a3 - output_data_ptr + */ + lw t7, 0(a3) + blez a0, 7f + andi t9, a1, 0xf // t9 = residual +0: + lw t6, 0(a2) // t6 = inptr + lw t5, 0(t7) // t5 = outptr + addu t8, t5, a1 // t8 = outptr end address + subu t8, t9 // t8 = end address - residual + beq t5, t8, 2f + move t4, t9 +1: + ulw t0, 0(t6) + srl t1, t0, 16 + ins t0, t0, 16, 16 + ins t0, t0, 8, 16 + ins t1, t1, 16, 16 + ins t1, t1, 8, 16 + ulw t2, 4(t6) + usw t0, 0(t5) + usw t1, 4(t5) + srl t3, t2, 16 + ins t2, t2, 16, 16 + ins t2, t2, 8, 16 + ins t3, t3, 16, 16 + ins t3, t3, 8, 16 + usw t2, 8(t5) + usw t3, 12(t5) + addiu t5, 16 + bne t5, t8, 1b + addiu t6, 8 + beqz t9, 3f + move t4, t9 +2: + lbu t0, 0(t6) + sb t0, 0(t5) + sb t0, 1(t5) + addiu t4, -2 + addiu t6, 1 + bgtz t4, 2b + addiu t5, 2 +3: + lw t6, 0(t7) // t6 = outptr[0] + lw t5, 4(t7) // t5 = outptr[1] + addu t4, t6, a1 // t4 = new end address + beq a1, t9, 5f + subu t8, t4, t9 +4: + ulw t0, 0(t6) + ulw t1, 4(t6) + ulw t2, 8(t6) + usw t0, 0(t5) + ulw t0, 12(t6) + usw t1, 4(t5) + usw t2, 8(t5) + usw t0, 12(t5) + addiu t6, 16 + bne t6, t8, 4b + addiu t5, 16 + beqz t9, 6f + nop +5: + lbu t0, 0(t6) + sb t0, 0(t5) + addiu t6, 1 + bne t6, t4, 5b + addiu t5, 1 +6: + addiu t7, 8 + addiu a0, -2 + bgtz a0, 0b + addiu a2, 4 +7: + j ra + nop +END(jsimd_h2v2_upsample_mips_dspr2) + +/*****************************************************************************/ +LEAF_MIPS_DSPR2(jsimd_idct_islow_mips_dspr2) +/* + * a0 - coef_block + * a1 - compptr->dcttable + * a2 - output + * a3 - range_limit + */ + + SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7 + + addiu sp, sp, -256 + move v0, sp + addiu v1, zero, 8 // v1 = DCTSIZE = 8 +1: + lh s4, 32(a0) // s4 = inptr[16] + lh s5, 64(a0) // s5 = inptr[32] + lh s6, 96(a0) // s6 = inptr[48] + lh t1, 112(a0) // t1 = inptr[56] + lh t7, 16(a0) // t7 = inptr[8] + lh t5, 80(a0) // t5 = inptr[40] + lh t3, 48(a0) // t3 = inptr[24] + or s4, s4, t1 + or s4, s4, t3 + or s4, s4, t5 + or s4, s4, t7 + or s4, s4, s5 + or s4, s4, s6 + bnez s4, 2f + addiu v1, v1, -1 + lh s5, 0(a1) // quantptr[DCTSIZE*0] + lh s6, 0(a0) // inptr[DCTSIZE*0] + mul s5, s5, s6 // DEQUANTIZE(inptr[0], quantptr[0]) + sll s5, s5, 2 + sw s5, 0(v0) + sw s5, 32(v0) + sw s5, 64(v0) + sw s5, 96(v0) + sw s5, 128(v0) + sw s5, 160(v0) + sw s5, 192(v0) + b 3f + sw s5, 224(v0) +2: + lh t0, 112(a1) + lh t2, 48(a1) + lh t4, 80(a1) + lh t6, 16(a1) + mul t0, t0, t1 // DEQUANTIZE(inptr[DCTSIZE*7],quant[DCTSIZE*7]) + mul t1, t2, t3 // DEQUANTIZE(inptr[DCTSIZE*3],quant[DCTSIZE*3]) + mul t2, t4, t5 // DEQUANTIZE(inptr[DCTSIZE*5],quant[DCTSIZE*5]) + mul t3, t6, t7 // DEQUANTIZE(inptr[DCTSIZE*1],quant[DCTSIZE*1]) + lh t4, 32(a1) + lh t5, 32(a0) + lh t6, 96(a1) + lh t7, 96(a0) + addu s0, t0, t1 // z3 = tmp0 + tmp2 + addu s1, t1, t2 // z2 = tmp1 + tmp2 + addu s2, t2, t3 // z4 = tmp1 + tmp3 + addu s3, s0, s2 // z3 + z4 + addiu t9, zero, 9633 // FIX_1_175875602 + mul s3, s3, t9 // z5 = MULTIPLY(z3 + z4, FIX_1_175875602) + addu t8, t0, t3 // z1 = tmp0 + tmp3 + addiu t9, zero, 2446 // FIX_0_298631336 + mul t0, t0, t9 // tmp0 = MULTIPLY(tmp0, FIX_0_298631336) + addiu t9, zero, 16819 // FIX_2_053119869 + mul t2, t2, t9 // tmp1 = MULTIPLY(tmp1, FIX_2_053119869) + addiu t9, zero, 25172 // FIX_3_072711026 + mul t1, t1, t9 // tmp2 = MULTIPLY(tmp2, FIX_3_072711026) + addiu t9, zero, 12299 // FIX_1_501321110 + mul t3, t3, t9 // tmp3 = MULTIPLY(tmp3, FIX_1_501321110) + addiu t9, zero, 16069 // FIX_1_961570560 + mul s0, s0, t9 // -z3 = MULTIPLY(z3, FIX_1_961570560) + addiu t9, zero, 3196 // FIX_0_390180644 + mul s2, s2, t9 // -z4 = MULTIPLY(z4, FIX_0_390180644) + addiu t9, zero, 7373 // FIX_0_899976223 + mul t8, t8, t9 // -z1 = MULTIPLY(z1, FIX_0_899976223) + addiu t9, zero, 20995 // FIX_2_562915447 + mul s1, s1, t9 // -z2 = MULTIPLY(z2, FIX_2_562915447) + subu s0, s3, s0 // z3 += z5 + addu t0, t0, s0 // tmp0 += z3 + addu t1, t1, s0 // tmp2 += z3 + subu s2, s3, s2 // z4 += z5 + addu t2, t2, s2 // tmp1 += z4 + addu t3, t3, s2 // tmp3 += z4 + subu t0, t0, t8 // tmp0 += z1 + subu t1, t1, s1 // tmp2 += z2 + subu t2, t2, s1 // tmp1 += z2 + subu t3, t3, t8 // tmp3 += z1 + mul s0, t4, t5 // DEQUANTIZE(inptr[DCTSIZE*2],quant[DCTSIZE*2]) + addiu t9, zero, 6270 // FIX_0_765366865 + mul s1, t6, t7 // DEQUANTIZE(inptr[DCTSIZE*6],quant[DCTSIZE*6]) + lh t4, 0(a1) + lh t5, 0(a0) + lh t6, 64(a1) + lh t7, 64(a0) + mul s2, t9, s0 // MULTIPLY(z2, FIX_0_765366865) + mul t5, t4, t5 // DEQUANTIZE(inptr[DCTSIZE*0],quant[DCTSIZE*0]) + mul t6, t6, t7 // DEQUANTIZE(inptr[DCTSIZE*4],quant[DCTSIZE*4]) + addiu t9, zero, 4433 // FIX_0_541196100 + addu s3, s0, s1 // z2 + z3 + mul s3, s3, t9 // z1 = MULTIPLY(z2 + z3, FIX_0_541196100) + addiu t9, zero, 15137 // FIX_1_847759065 + mul t8, s1, t9 // MULTIPLY(z3, FIX_1_847759065) + addu t4, t5, t6 + subu t5, t5, t6 + sll t4, t4, 13 // tmp0 = (z2 + z3) << CONST_BITS + sll t5, t5, 13 // tmp1 = (z2 - z3) << CONST_BITS + addu t7, s3, s2 // tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865) + subu t6, s3, t8 // tmp2 = z1 + MULTIPLY(z3, - FIX_1_847759065) + addu s0, t4, t7 + subu s1, t4, t7 + addu s2, t5, t6 + subu s3, t5, t6 + addu t4, s0, t3 + subu s0, s0, t3 + addu t3, s2, t1 + subu s2, s2, t1 + addu t1, s3, t2 + subu s3, s3, t2 + addu t2, s1, t0 + subu s1, s1, t0 + shra_r.w t4, t4, 11 + shra_r.w t3, t3, 11 + shra_r.w t1, t1, 11 + shra_r.w t2, t2, 11 + shra_r.w s1, s1, 11 + shra_r.w s3, s3, 11 + shra_r.w s2, s2, 11 + shra_r.w s0, s0, 11 + sw t4, 0(v0) + sw t3, 32(v0) + sw t1, 64(v0) + sw t2, 96(v0) + sw s1, 128(v0) + sw s3, 160(v0) + sw s2, 192(v0) + sw s0, 224(v0) +3: + addiu a1, a1, 2 + addiu a0, a0, 2 + bgtz v1, 1b + addiu v0, v0, 4 + move v0, sp + addiu v1, zero, 8 +4: + lw t0, 8(v0) // z2 = (JLONG) wsptr[2] + lw t1, 24(v0) // z3 = (JLONG) wsptr[6] + lw t2, 0(v0) // (JLONG) wsptr[0] + lw t3, 16(v0) // (JLONG) wsptr[4] + lw s4, 4(v0) // (JLONG) wsptr[1] + lw s5, 12(v0) // (JLONG) wsptr[3] + lw s6, 20(v0) // (JLONG) wsptr[5] + lw s7, 28(v0) // (JLONG) wsptr[7] + or s4, s4, t0 + or s4, s4, t1 + or s4, s4, t3 + or s4, s4, s7 + or s4, s4, s5 + or s4, s4, s6 + bnez s4, 5f + addiu v1, v1, -1 + shra_r.w s5, t2, 5 + andi s5, s5, 0x3ff + lbux s5, s5(a3) + lw s1, 0(a2) + replv.qb s5, s5 + usw s5, 0(s1) + usw s5, 4(s1) + b 6f + nop +5: + addu t4, t0, t1 // z2 + z3 + addiu t8, zero, 4433 // FIX_0_541196100 + mul t5, t4, t8 // z1 = MULTIPLY(z2 + z3, FIX_0_541196100) + addiu t8, zero, 15137 // FIX_1_847759065 + mul t1, t1, t8 // MULTIPLY(z3, FIX_1_847759065) + addiu t8, zero, 6270 // FIX_0_765366865 + mul t0, t0, t8 // MULTIPLY(z2, FIX_0_765366865) + addu t4, t2, t3 // (JLONG) wsptr[0] + (JLONG) wsptr[4] + subu t2, t2, t3 // (JLONG) wsptr[0] - (JLONG) wsptr[4] + sll t4, t4, 13 // tmp0 = ((wsptr[0] + wsptr[4]) << CONST_BITS + sll t2, t2, 13 // tmp1 = ((wsptr[0] - wsptr[4]) << CONST_BITS + subu t1, t5, t1 // tmp2 = z1 + MULTIPLY(z3, - FIX_1_847759065) + subu t3, t2, t1 // tmp12 = tmp1 - tmp2 + addu t2, t2, t1 // tmp11 = tmp1 + tmp2 + addu t5, t5, t0 // tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865) + subu t1, t4, t5 // tmp13 = tmp0 - tmp3 + addu t0, t4, t5 // tmp10 = tmp0 + tmp3 + lw t4, 28(v0) // tmp0 = (JLONG) wsptr[7] + lw t6, 12(v0) // tmp2 = (JLONG) wsptr[3] + lw t5, 20(v0) // tmp1 = (JLONG) wsptr[5] + lw t7, 4(v0) // tmp3 = (JLONG) wsptr[1] + addu s0, t4, t6 // z3 = tmp0 + tmp2 + addiu t8, zero, 9633 // FIX_1_175875602 + addu s1, t5, t7 // z4 = tmp1 + tmp3 + addu s2, s0, s1 // z3 + z4 + mul s2, s2, t8 // z5 = MULTIPLY(z3 + z4, FIX_1_175875602) + addu s3, t4, t7 // z1 = tmp0 + tmp3 + addu t9, t5, t6 // z2 = tmp1 + tmp2 + addiu t8, zero, 16069 // FIX_1_961570560 + mul s0, s0, t8 // -z3 = MULTIPLY(z3, FIX_1_961570560) + addiu t8, zero, 3196 // FIX_0_390180644 + mul s1, s1, t8 // -z4 = MULTIPLY(z4, FIX_0_390180644) + addiu t8, zero, 2446 // FIX_0_298631336 + mul t4, t4, t8 // tmp0 = MULTIPLY(tmp0, FIX_0_298631336) + addiu t8, zero, 7373 // FIX_0_899976223 + mul s3, s3, t8 // -z1 = MULTIPLY(z1, FIX_0_899976223) + addiu t8, zero, 16819 // FIX_2_053119869 + mul t5, t5, t8 // tmp1 = MULTIPLY(tmp1, FIX_2_053119869) + addiu t8, zero, 20995 // FIX_2_562915447 + mul t9, t9, t8 // -z2 = MULTIPLY(z2, FIX_2_562915447) + addiu t8, zero, 25172 // FIX_3_072711026 + mul t6, t6, t8 // tmp2 = MULTIPLY(tmp2, FIX_3_072711026) + addiu t8, zero, 12299 // FIX_1_501321110 + mul t7, t7, t8 // tmp3 = MULTIPLY(tmp3, FIX_1_501321110) + subu s0, s2, s0 // z3 += z5 + subu s1, s2, s1 // z4 += z5 + addu t4, t4, s0 + subu t4, t4, s3 // tmp0 + addu t5, t5, s1 + subu t5, t5, t9 // tmp1 + addu t6, t6, s0 + subu t6, t6, t9 // tmp2 + addu t7, t7, s1 + subu t7, t7, s3 // tmp3 + addu s0, t0, t7 + subu t0, t0, t7 + addu t7, t2, t6 + subu t2, t2, t6 + addu t6, t3, t5 + subu t3, t3, t5 + addu t5, t1, t4 + subu t1, t1, t4 + shra_r.w s0, s0, 18 + shra_r.w t7, t7, 18 + shra_r.w t6, t6, 18 + shra_r.w t5, t5, 18 + shra_r.w t1, t1, 18 + shra_r.w t3, t3, 18 + shra_r.w t2, t2, 18 + shra_r.w t0, t0, 18 + andi s0, s0, 0x3ff + andi t7, t7, 0x3ff + andi t6, t6, 0x3ff + andi t5, t5, 0x3ff + andi t1, t1, 0x3ff + andi t3, t3, 0x3ff + andi t2, t2, 0x3ff + andi t0, t0, 0x3ff + lw s1, 0(a2) + lbux s0, s0(a3) + lbux t7, t7(a3) + lbux t6, t6(a3) + lbux t5, t5(a3) + lbux t1, t1(a3) + lbux t3, t3(a3) + lbux t2, t2(a3) + lbux t0, t0(a3) + sb s0, 0(s1) + sb t7, 1(s1) + sb t6, 2(s1) + sb t5, 3(s1) + sb t1, 4(s1) + sb t3, 5(s1) + sb t2, 6(s1) + sb t0, 7(s1) +6: + addiu v0, v0, 32 + bgtz v1, 4b + addiu a2, a2, 4 + addiu sp, sp, 256 + + RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7 + + j ra + nop + +END(jsimd_idct_islow_mips_dspr2) + +/*****************************************************************************/ +LEAF_MIPS_DSPR2(jsimd_idct_ifast_cols_mips_dspr2) +/* + * a0 - inptr + * a1 - quantptr + * a2 - wsptr + * a3 - mips_idct_ifast_coefs + */ + + SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7 + + addiu t9, a0, 16 // end address + or AT, a3, zero + +0: + lw s0, 0(a1) // quantptr[DCTSIZE*0] + lw t0, 0(a0) // inptr[DCTSIZE*0] + lw t1, 16(a0) // inptr[DCTSIZE*1] + muleq_s.w.phl v0, t0, s0 // tmp0 ... + lw t2, 32(a0) // inptr[DCTSIZE*2] + lw t3, 48(a0) // inptr[DCTSIZE*3] + lw t4, 64(a0) // inptr[DCTSIZE*4] + lw t5, 80(a0) // inptr[DCTSIZE*5] + muleq_s.w.phr t0, t0, s0 // ... tmp0 ... + lw t6, 96(a0) // inptr[DCTSIZE*6] + lw t7, 112(a0) // inptr[DCTSIZE*7] + or s4, t1, t2 + or s5, t3, t4 + bnez s4, 1f + ins t0, v0, 16, 16 // ... tmp0 + bnez s5, 1f + or s6, t5, t6 + or s6, s6, t7 + bnez s6, 1f + sw t0, 0(a2) // wsptr[DCTSIZE*0] + sw t0, 16(a2) // wsptr[DCTSIZE*1] + sw t0, 32(a2) // wsptr[DCTSIZE*2] + sw t0, 48(a2) // wsptr[DCTSIZE*3] + sw t0, 64(a2) // wsptr[DCTSIZE*4] + sw t0, 80(a2) // wsptr[DCTSIZE*5] + sw t0, 96(a2) // wsptr[DCTSIZE*6] + sw t0, 112(a2) // wsptr[DCTSIZE*7] + addiu a0, a0, 4 + b 2f + addiu a1, a1, 4 + +1: + lw s1, 32(a1) // quantptr[DCTSIZE*2] + lw s2, 64(a1) // quantptr[DCTSIZE*4] + muleq_s.w.phl v0, t2, s1 // tmp1 ... + muleq_s.w.phr t2, t2, s1 // ... tmp1 ... + lw s0, 16(a1) // quantptr[DCTSIZE*1] + lw s1, 48(a1) // quantptr[DCTSIZE*3] + lw s3, 96(a1) // quantptr[DCTSIZE*6] + muleq_s.w.phl v1, t4, s2 // tmp2 ... + muleq_s.w.phr t4, t4, s2 // ... tmp2 ... + lw s2, 80(a1) // quantptr[DCTSIZE*5] + lw t8, 4(AT) // FIX(1.414213562) + ins t2, v0, 16, 16 // ... tmp1 + muleq_s.w.phl v0, t6, s3 // tmp3 ... + muleq_s.w.phr t6, t6, s3 // ... tmp3 ... + ins t4, v1, 16, 16 // ... tmp2 + addq.ph s4, t0, t4 // tmp10 + subq.ph s5, t0, t4 // tmp11 + ins t6, v0, 16, 16 // ... tmp3 + subq.ph s6, t2, t6 // tmp12 ... + addq.ph s7, t2, t6 // tmp13 + mulq_s.ph s6, s6, t8 // ... tmp12 ... + addq.ph t0, s4, s7 // tmp0 + subq.ph t6, s4, s7 // tmp3 + muleq_s.w.phl v0, t1, s0 // tmp4 ... + muleq_s.w.phr t1, t1, s0 // ... tmp4 ... + shll_s.ph s6, s6, 1 // x2 + lw s3, 112(a1) // quantptr[DCTSIZE*7] + subq.ph s6, s6, s7 // ... tmp12 + muleq_s.w.phl v1, t7, s3 // tmp7 ... + muleq_s.w.phr t7, t7, s3 // ... tmp7 ... + ins t1, v0, 16, 16 // ... tmp4 + addq.ph t2, s5, s6 // tmp1 + subq.ph t4, s5, s6 // tmp2 + muleq_s.w.phl v0, t5, s2 // tmp6 ... + muleq_s.w.phr t5, t5, s2 // ... tmp6 ... + ins t7, v1, 16, 16 // ... tmp7 + addq.ph s5, t1, t7 // z11 + subq.ph s6, t1, t7 // z12 + muleq_s.w.phl v1, t3, s1 // tmp5 ... + muleq_s.w.phr t3, t3, s1 // ... tmp5 ... + ins t5, v0, 16, 16 // ... tmp6 + ins t3, v1, 16, 16 // ... tmp5 + addq.ph s7, t5, t3 // z13 + subq.ph v0, t5, t3 // z10 + addq.ph t7, s5, s7 // tmp7 + subq.ph s5, s5, s7 // tmp11 ... + addq.ph v1, v0, s6 // z5 ... + mulq_s.ph s5, s5, t8 // ... tmp11 + lw t8, 8(AT) // FIX(1.847759065) + lw s4, 0(AT) // FIX(1.082392200) + addq.ph s0, t0, t7 + subq.ph s1, t0, t7 + mulq_s.ph v1, v1, t8 // ... z5 + shll_s.ph s5, s5, 1 // x2 + lw t8, 12(AT) // FIX(-2.613125930) + sw s0, 0(a2) // wsptr[DCTSIZE*0] + shll_s.ph v0, v0, 1 // x4 + mulq_s.ph v0, v0, t8 // tmp12 ... + mulq_s.ph s4, s6, s4 // tmp10 ... + shll_s.ph v1, v1, 1 // x2 + addiu a0, a0, 4 + addiu a1, a1, 4 + sw s1, 112(a2) // wsptr[DCTSIZE*7] + shll_s.ph s6, v0, 1 // x4 + shll_s.ph s4, s4, 1 // x2 + addq.ph s6, s6, v1 // ... tmp12 + subq.ph t5, s6, t7 // tmp6 + subq.ph s4, s4, v1 // ... tmp10 + subq.ph t3, s5, t5 // tmp5 + addq.ph s2, t2, t5 + addq.ph t1, s4, t3 // tmp4 + subq.ph s3, t2, t5 + sw s2, 16(a2) // wsptr[DCTSIZE*1] + sw s3, 96(a2) // wsptr[DCTSIZE*6] + addq.ph v0, t4, t3 + subq.ph v1, t4, t3 + sw v0, 32(a2) // wsptr[DCTSIZE*2] + sw v1, 80(a2) // wsptr[DCTSIZE*5] + addq.ph v0, t6, t1 + subq.ph v1, t6, t1 + sw v0, 64(a2) // wsptr[DCTSIZE*4] + sw v1, 48(a2) // wsptr[DCTSIZE*3] + +2: + bne a0, t9, 0b + addiu a2, a2, 4 + + RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7 + + j ra + nop + +END(jsimd_idct_ifast_cols_mips_dspr2) + +/*****************************************************************************/ +LEAF_MIPS_DSPR2(jsimd_idct_ifast_rows_mips_dspr2) +/* + * a0 - wsptr + * a1 - output_buf + * a2 - output_col + * a3 - mips_idct_ifast_coefs + */ + + SAVE_REGS_ON_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, s8, a3 + + addiu t9, a0, 128 // end address + lui s8, 0x8080 + ori s8, s8, 0x8080 + +0: + lw AT, 36(sp) // restore $a3 (mips_idct_ifast_coefs) + lw t0, 0(a0) // wsptr[DCTSIZE*0+0/1] b a + lw s0, 16(a0) // wsptr[DCTSIZE*1+0/1] B A + lw t2, 4(a0) // wsptr[DCTSIZE*0+2/3] d c + lw s2, 20(a0) // wsptr[DCTSIZE*1+2/3] D C + lw t4, 8(a0) // wsptr[DCTSIZE*0+4/5] f e + lw s4, 24(a0) // wsptr[DCTSIZE*1+4/5] F E + lw t6, 12(a0) // wsptr[DCTSIZE*0+6/7] h g + lw s6, 28(a0) // wsptr[DCTSIZE*1+6/7] H G + precrq.ph.w t1, s0, t0 // B b + ins t0, s0, 16, 16 // A a + bnez t1, 1f + or s0, t2, s2 + bnez s0, 1f + or s0, t4, s4 + bnez s0, 1f + or s0, t6, s6 + bnez s0, 1f + shll_s.ph s0, t0, 2 // A a + lw a3, 0(a1) + lw AT, 4(a1) + precrq.ph.w t0, s0, s0 // A A + ins s0, s0, 16, 16 // a a + addu a3, a3, a2 + addu AT, AT, a2 + precrq.qb.ph t0, t0, t0 // A A A A + precrq.qb.ph s0, s0, s0 // a a a a + addu.qb s0, s0, s8 + addu.qb t0, t0, s8 + sw s0, 0(a3) + sw s0, 4(a3) + sw t0, 0(AT) + sw t0, 4(AT) + addiu a0, a0, 32 + bne a0, t9, 0b + addiu a1, a1, 8 + b 2f + nop + +1: + precrq.ph.w t3, s2, t2 + ins t2, s2, 16, 16 + precrq.ph.w t5, s4, t4 + ins t4, s4, 16, 16 + precrq.ph.w t7, s6, t6 + ins t6, s6, 16, 16 + lw t8, 4(AT) // FIX(1.414213562) + addq.ph s4, t0, t4 // tmp10 + subq.ph s5, t0, t4 // tmp11 + subq.ph s6, t2, t6 // tmp12 ... + addq.ph s7, t2, t6 // tmp13 + mulq_s.ph s6, s6, t8 // ... tmp12 ... + addq.ph t0, s4, s7 // tmp0 + subq.ph t6, s4, s7 // tmp3 + shll_s.ph s6, s6, 1 // x2 + subq.ph s6, s6, s7 // ... tmp12 + addq.ph t2, s5, s6 // tmp1 + subq.ph t4, s5, s6 // tmp2 + addq.ph s5, t1, t7 // z11 + subq.ph s6, t1, t7 // z12 + addq.ph s7, t5, t3 // z13 + subq.ph v0, t5, t3 // z10 + addq.ph t7, s5, s7 // tmp7 + subq.ph s5, s5, s7 // tmp11 ... + addq.ph v1, v0, s6 // z5 ... + mulq_s.ph s5, s5, t8 // ... tmp11 + lw t8, 8(AT) // FIX(1.847759065) + lw s4, 0(AT) // FIX(1.082392200) + addq.ph s0, t0, t7 // tmp0 + tmp7 + subq.ph s7, t0, t7 // tmp0 - tmp7 + mulq_s.ph v1, v1, t8 // ... z5 + lw a3, 0(a1) + lw t8, 12(AT) // FIX(-2.613125930) + shll_s.ph s5, s5, 1 // x2 + addu a3, a3, a2 + shll_s.ph v0, v0, 1 // x4 + mulq_s.ph v0, v0, t8 // tmp12 ... + mulq_s.ph s4, s6, s4 // tmp10 ... + shll_s.ph v1, v1, 1 // x2 + addiu a0, a0, 32 + addiu a1, a1, 8 + shll_s.ph s6, v0, 1 // x4 + shll_s.ph s4, s4, 1 // x2 + addq.ph s6, s6, v1 // ... tmp12 + shll_s.ph s0, s0, 2 + subq.ph t5, s6, t7 // tmp6 + subq.ph s4, s4, v1 // ... tmp10 + subq.ph t3, s5, t5 // tmp5 + shll_s.ph s7, s7, 2 + addq.ph t1, s4, t3 // tmp4 + addq.ph s1, t2, t5 // tmp1 + tmp6 + subq.ph s6, t2, t5 // tmp1 - tmp6 + addq.ph s2, t4, t3 // tmp2 + tmp5 + subq.ph s5, t4, t3 // tmp2 - tmp5 + addq.ph s4, t6, t1 // tmp3 + tmp4 + subq.ph s3, t6, t1 // tmp3 - tmp4 + shll_s.ph s1, s1, 2 + shll_s.ph s2, s2, 2 + shll_s.ph s3, s3, 2 + shll_s.ph s4, s4, 2 + shll_s.ph s5, s5, 2 + shll_s.ph s6, s6, 2 + precrq.ph.w t0, s1, s0 // B A + ins s0, s1, 16, 16 // b a + precrq.ph.w t2, s3, s2 // D C + ins s2, s3, 16, 16 // d c + precrq.ph.w t4, s5, s4 // F E + ins s4, s5, 16, 16 // f e + precrq.ph.w t6, s7, s6 // H G + ins s6, s7, 16, 16 // h g + precrq.qb.ph t0, t2, t0 // D C B A + precrq.qb.ph s0, s2, s0 // d c b a + precrq.qb.ph t4, t6, t4 // H G F E + precrq.qb.ph s4, s6, s4 // h g f e + addu.qb s0, s0, s8 + addu.qb s4, s4, s8 + sw s0, 0(a3) // outptr[0/1/2/3] d c b a + sw s4, 4(a3) // outptr[4/5/6/7] h g f e + lw a3, -4(a1) + addu.qb t0, t0, s8 + addu a3, a3, a2 + addu.qb t4, t4, s8 + sw t0, 0(a3) // outptr[0/1/2/3] D C B A + bne a0, t9, 0b + sw t4, 4(a3) // outptr[4/5/6/7] H G F E + +2: + + RESTORE_REGS_FROM_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, s8, a3 + + j ra + nop + +END(jsimd_idct_ifast_rows_mips_dspr2) + +/*****************************************************************************/ +LEAF_MIPS_DSPR2(jsimd_fdct_islow_mips_dspr2) +/* + * a0 - data + */ + + SAVE_REGS_ON_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, s8 + + lui t0, 6437 + ori t0, 2260 + lui t1, 9633 + ori t1, 11363 + lui t2, 0xd39e + ori t2, 0xe6dc + lui t3, 0xf72d + ori t3, 9633 + lui t4, 2261 + ori t4, 9633 + lui t5, 0xd39e + ori t5, 6437 + lui t6, 9633 + ori t6, 0xd39d + lui t7, 0xe6dc + ori t7, 2260 + lui t8, 4433 + ori t8, 10703 + lui t9, 0xd630 + ori t9, 4433 + li s8, 8 + move a1, a0 +1: + lw s0, 0(a1) // tmp0 = 1|0 + lw s1, 4(a1) // tmp1 = 3|2 + lw s2, 8(a1) // tmp2 = 5|4 + lw s3, 12(a1) // tmp3 = 7|6 + packrl.ph s1, s1, s1 // tmp1 = 2|3 + packrl.ph s3, s3, s3 // tmp3 = 6|7 + subq.ph s7, s1, s2 // tmp7 = 2-5|3-4 = t5|t4 + subq.ph s5, s0, s3 // tmp5 = 1-6|0-7 = t6|t7 + mult $0, $0 // ac0 = 0 + dpa.w.ph $ac0, s7, t0 // ac0 += t5* 6437 + t4* 2260 + dpa.w.ph $ac0, s5, t1 // ac0 += t6* 9633 + t7* 11363 + mult $ac1, $0, $0 // ac1 = 0 + dpa.w.ph $ac1, s7, t2 // ac1 += t5*-11362 + t4* -6436 + dpa.w.ph $ac1, s5, t3 // ac1 += t6* -2259 + t7* 9633 + mult $ac2, $0, $0 // ac2 = 0 + dpa.w.ph $ac2, s7, t4 // ac2 += t5* 2261 + t4* 9633 + dpa.w.ph $ac2, s5, t5 // ac2 += t6*-11362 + t7* 6437 + mult $ac3, $0, $0 // ac3 = 0 + dpa.w.ph $ac3, s7, t6 // ac3 += t5* 9633 + t4*-11363 + dpa.w.ph $ac3, s5, t7 // ac3 += t6* -6436 + t7* 2260 + addq.ph s6, s1, s2 // tmp6 = 2+5|3+4 = t2|t3 + addq.ph s4, s0, s3 // tmp4 = 1+6|0+7 = t1|t0 + extr_r.w s0, $ac0, 11 // tmp0 = (ac0 + 1024) >> 11 + extr_r.w s1, $ac1, 11 // tmp1 = (ac1 + 1024) >> 11 + extr_r.w s2, $ac2, 11 // tmp2 = (ac2 + 1024) >> 11 + extr_r.w s3, $ac3, 11 // tmp3 = (ac3 + 1024) >> 11 + addq.ph s5, s4, s6 // tmp5 = t1+t2|t0+t3 = t11|t10 + subq.ph s7, s4, s6 // tmp7 = t1-t2|t0-t3 = t12|t13 + sh s0, 2(a1) + sh s1, 6(a1) + sh s2, 10(a1) + sh s3, 14(a1) + mult $0, $0 // ac0 = 0 + dpa.w.ph $ac0, s7, t8 // ac0 += t12* 4433 + t13* 10703 + mult $ac1, $0, $0 // ac1 = 0 + dpa.w.ph $ac1, s7, t9 // ac1 += t12*-10704 + t13* 4433 + sra s4, s5, 16 // tmp4 = t11 + addiu a1, a1, 16 + addiu s8, s8, -1 + extr_r.w s0, $ac0, 11 // tmp0 = (ac0 + 1024) >> 11 + extr_r.w s1, $ac1, 11 // tmp1 = (ac1 + 1024) >> 11 + addu s2, s5, s4 // tmp2 = t10 + t11 + subu s3, s5, s4 // tmp3 = t10 - t11 + sll s2, s2, 2 // tmp2 = (t10 + t11) << 2 + sll s3, s3, 2 // tmp3 = (t10 - t11) << 2 + sh s2, -16(a1) + sh s3, -8(a1) + sh s0, -12(a1) + bgtz s8, 1b + sh s1, -4(a1) + li t0, 2260 + li t1, 11363 + li t2, 9633 + li t3, 6436 + li t4, 6437 + li t5, 2261 + li t6, 11362 + li t7, 2259 + li t8, 4433 + li t9, 10703 + li a1, 10704 + li s8, 8 + +2: + lh a2, 0(a0) // 0 + lh a3, 16(a0) // 8 + lh v0, 32(a0) // 16 + lh v1, 48(a0) // 24 + lh s4, 64(a0) // 32 + lh s5, 80(a0) // 40 + lh s6, 96(a0) // 48 + lh s7, 112(a0) // 56 + addu s2, v0, s5 // tmp2 = 16 + 40 + subu s5, v0, s5 // tmp5 = 16 - 40 + addu s3, v1, s4 // tmp3 = 24 + 32 + subu s4, v1, s4 // tmp4 = 24 - 32 + addu s0, a2, s7 // tmp0 = 0 + 56 + subu s7, a2, s7 // tmp7 = 0 - 56 + addu s1, a3, s6 // tmp1 = 8 + 48 + subu s6, a3, s6 // tmp6 = 8 - 48 + addu a2, s0, s3 // tmp10 = tmp0 + tmp3 + subu v1, s0, s3 // tmp13 = tmp0 - tmp3 + addu a3, s1, s2 // tmp11 = tmp1 + tmp2 + subu v0, s1, s2 // tmp12 = tmp1 - tmp2 + mult s7, t1 // ac0 = tmp7 * c1 + madd s4, t0 // ac0 += tmp4 * c0 + madd s5, t4 // ac0 += tmp5 * c4 + madd s6, t2 // ac0 += tmp6 * c2 + mult $ac1, s7, t2 // ac1 = tmp7 * c2 + msub $ac1, s4, t3 // ac1 -= tmp4 * c3 + msub $ac1, s5, t6 // ac1 -= tmp5 * c6 + msub $ac1, s6, t7 // ac1 -= tmp6 * c7 + mult $ac2, s7, t4 // ac2 = tmp7 * c4 + madd $ac2, s4, t2 // ac2 += tmp4 * c2 + madd $ac2, s5, t5 // ac2 += tmp5 * c5 + msub $ac2, s6, t6 // ac2 -= tmp6 * c6 + mult $ac3, s7, t0 // ac3 = tmp7 * c0 + msub $ac3, s4, t1 // ac3 -= tmp4 * c1 + madd $ac3, s5, t2 // ac3 += tmp5 * c2 + msub $ac3, s6, t3 // ac3 -= tmp6 * c3 + extr_r.w s0, $ac0, 15 // tmp0 = (ac0 + 16384) >> 15 + extr_r.w s1, $ac1, 15 // tmp1 = (ac1 + 16384) >> 15 + extr_r.w s2, $ac2, 15 // tmp2 = (ac2 + 16384) >> 15 + extr_r.w s3, $ac3, 15 // tmp3 = (ac3 + 16384) >> 15 + addiu s8, s8, -1 + addu s4, a2, a3 // tmp4 = tmp10 + tmp11 + subu s5, a2, a3 // tmp5 = tmp10 - tmp11 + sh s0, 16(a0) + sh s1, 48(a0) + sh s2, 80(a0) + sh s3, 112(a0) + mult v0, t8 // ac0 = tmp12 * c8 + madd v1, t9 // ac0 += tmp13 * c9 + mult $ac1, v1, t8 // ac1 = tmp13 * c8 + msub $ac1, v0, a1 // ac1 -= tmp12 * c10 + addiu a0, a0, 2 + extr_r.w s6, $ac0, 15 // tmp6 = (ac0 + 16384) >> 15 + extr_r.w s7, $ac1, 15 // tmp7 = (ac1 + 16384) >> 15 + shra_r.w s4, s4, 2 // tmp4 = (tmp4 + 2) >> 2 + shra_r.w s5, s5, 2 // tmp5 = (tmp5 + 2) >> 2 + sh s4, -2(a0) + sh s5, 62(a0) + sh s6, 30(a0) + bgtz s8, 2b + sh s7, 94(a0) + + RESTORE_REGS_FROM_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, s8 + + jr ra + nop + +END(jsimd_fdct_islow_mips_dspr2) + +/*****************************************************************************/ +LEAF_MIPS_DSPR2(jsimd_fdct_ifast_mips_dspr2) +/* + * a0 - data + */ + .set at + SAVE_REGS_ON_STACK 8, s0, s1 + li a1, 0x014e014e // FIX_1_306562965 (334 << 16)|(334 & 0xffff) + li a2, 0x008b008b // FIX_0_541196100 (139 << 16)|(139 & 0xffff) + li a3, 0x00620062 // FIX_0_382683433 (98 << 16) |(98 & 0xffff) + li s1, 0x00b500b5 // FIX_0_707106781 (181 << 16)|(181 & 0xffff) + + move v0, a0 + addiu v1, v0, 128 // end address + +0: + lw t0, 0(v0) // tmp0 = 1|0 + lw t1, 4(v0) // tmp1 = 3|2 + lw t2, 8(v0) // tmp2 = 5|4 + lw t3, 12(v0) // tmp3 = 7|6 + packrl.ph t1, t1, t1 // tmp1 = 2|3 + packrl.ph t3, t3, t3 // tmp3 = 6|7 + subq.ph t7, t1, t2 // tmp7 = 2-5|3-4 = t5|t4 + subq.ph t5, t0, t3 // tmp5 = 1-6|0-7 = t6|t7 + addq.ph t6, t1, t2 // tmp6 = 2+5|3+4 = t2|t3 + addq.ph t4, t0, t3 // tmp4 = 1+6|0+7 = t1|t0 + addq.ph t8, t4, t6 // tmp5 = t1+t2|t0+t3 = t11|t10 + subq.ph t9, t4, t6 // tmp7 = t1-t2|t0-t3 = t12|t13 + sra t4, t8, 16 // tmp4 = t11 + mult $0, $0 // ac0 = 0 + dpa.w.ph $ac0, t9, s1 + mult $ac1, $0, $0 // ac1 = 0 + dpa.w.ph $ac1, t7, a3 // ac1 += t4*98 + t5*98 + dpsx.w.ph $ac1, t5, a3 // ac1 += t6*98 + t7*98 + mult $ac2, $0, $0 // ac2 = 0 + dpa.w.ph $ac2, t7, a2 // ac2 += t4*139 + t5*139 + mult $ac3, $0, $0 // ac3 = 0 + dpa.w.ph $ac3, t5, a1 // ac3 += t6*334 + t7*334 + precrq.ph.w t0, t5, t7 // t0 = t5|t6 + addq.ph t2, t8, t4 // tmp2 = t10 + t11 + subq.ph t3, t8, t4 // tmp3 = t10 - t11 + extr.w t4, $ac0, 8 + mult $0, $0 // ac0 = 0 + dpa.w.ph $ac0, t0, s1 // ac0 += t5*181 + t6*181 + extr.w t0, $ac1, 8 // t0 = z5 + extr.w t1, $ac2, 8 // t1 = MULTIPLY(tmp10, 139) + extr.w t7, $ac3, 8 // t2 = MULTIPLY(tmp12, 334) + extr.w t8, $ac0, 8 // t8 = z3 = MULTIPLY(tmp11, 181) + add t6, t1, t0 // t6 = z2 + add t7, t7, t0 // t7 = z4 + subq.ph t0, t5, t8 // t0 = z13 = tmp7 - z3 + addq.ph t8, t5, t8 // t9 = z11 = tmp7 + z3 + addq.ph t1, t0, t6 // t1 = z13 + z2 + subq.ph t6, t0, t6 // t6 = z13 - z2 + addq.ph t0, t8, t7 // t0 = z11 + z4 + subq.ph t7, t8, t7 // t7 = z11 - z4 + addq.ph t5, t4, t9 + subq.ph t4, t9, t4 + sh t2, 0(v0) + sh t5, 4(v0) + sh t3, 8(v0) + sh t4, 12(v0) + sh t1, 10(v0) + sh t6, 6(v0) + sh t0, 2(v0) + sh t7, 14(v0) + addiu v0, 16 + bne v1, v0, 0b + nop + move v0, a0 + addiu v1, v0, 16 + +1: + lh t0, 0(v0) // 0 + lh t1, 16(v0) // 8 + lh t2, 32(v0) // 16 + lh t3, 48(v0) // 24 + lh t4, 64(v0) // 32 + lh t5, 80(v0) // 40 + lh t6, 96(v0) // 48 + lh t7, 112(v0) // 56 + add t8, t0, t7 // t8 = tmp0 + sub t7, t0, t7 // t7 = tmp7 + add t0, t1, t6 // t0 = tmp1 + sub t1, t1, t6 // t1 = tmp6 + add t6, t2, t5 // t6 = tmp2 + sub t5, t2, t5 // t5 = tmp5 + add t2, t3, t4 // t2 = tmp3 + sub t3, t3, t4 // t3 = tmp4 + add t4, t8, t2 // t4 = tmp10 = tmp0 + tmp3 + sub t8, t8, t2 // t8 = tmp13 = tmp0 - tmp3 + sub s0, t0, t6 // s0 = tmp12 = tmp1 - tmp2 + ins t8, s0, 16, 16 // t8 = tmp12|tmp13 + add t2, t0, t6 // t2 = tmp11 = tmp1 + tmp2 + mult $0, $0 // ac0 = 0 + dpa.w.ph $ac0, t8, s1 // ac0 += t12*181 + t13*181 + add s0, t4, t2 // t8 = tmp10+tmp11 + sub t4, t4, t2 // t4 = tmp10-tmp11 + sh s0, 0(v0) + sh t4, 64(v0) + extr.w t2, $ac0, 8 // z1 = MULTIPLY(tmp12+tmp13,FIX_0_707106781) + addq.ph t4, t8, t2 // t9 = tmp13 + z1 + subq.ph t8, t8, t2 // t2 = tmp13 - z1 + sh t4, 32(v0) + sh t8, 96(v0) + add t3, t3, t5 // t3 = tmp10 = tmp4 + tmp5 + add t0, t5, t1 // t0 = tmp11 = tmp5 + tmp6 + add t1, t1, t7 // t1 = tmp12 = tmp6 + tmp7 + andi t4, a1, 0xffff + mul s0, t1, t4 + sra s0, s0, 8 // s0 = z4 = MULTIPLY(tmp12, FIX_1_306562965) + ins t1, t3, 16, 16 // t1 = tmp10|tmp12 + mult $0, $0 // ac0 = 0 + mulsa.w.ph $ac0, t1, a3 // ac0 += t10*98 - t12*98 + extr.w t8, $ac0, 8 // z5 = MULTIPLY(tmp10-tmp12,FIX_0_382683433) + add t2, t7, t8 // t2 = tmp7 + z5 + sub t7, t7, t8 // t7 = tmp7 - z5 + andi t4, a2, 0xffff + mul t8, t3, t4 + sra t8, t8, 8 // t8 = z2 = MULTIPLY(tmp10, FIX_0_541196100) + andi t4, s1, 0xffff + mul t6, t0, t4 + sra t6, t6, 8 // t6 = z3 = MULTIPLY(tmp11, FIX_0_707106781) + add t0, t6, t8 // t0 = z3 + z2 + sub t1, t6, t8 // t1 = z3 - z2 + add t3, t6, s0 // t3 = z3 + z4 + sub t4, t6, s0 // t4 = z3 - z4 + sub t5, t2, t1 // t5 = dataptr[5] + sub t6, t7, t0 // t6 = dataptr[3] + add t3, t2, t3 // t3 = dataptr[1] + add t4, t7, t4 // t4 = dataptr[7] + sh t5, 80(v0) + sh t6, 48(v0) + sh t3, 16(v0) + sh t4, 112(v0) + addiu v0, 2 + bne v0, v1, 1b + nop + + RESTORE_REGS_FROM_STACK 8, s0, s1 + + j ra + nop +END(jsimd_fdct_ifast_mips_dspr2) + +/*****************************************************************************/ +LEAF_MIPS_DSPR2(jsimd_quantize_mips_dspr2) +/* + * a0 - coef_block + * a1 - divisors + * a2 - workspace + */ + + .set at + + SAVE_REGS_ON_STACK 16, s0, s1, s2 + + addiu v0, a2, 124 // v0 = workspace_end + lh t0, 0(a2) + lh t1, 0(a1) + lh t2, 128(a1) + sra t3, t0, 15 + sll t3, t3, 1 + addiu t3, t3, 1 + mul t0, t0, t3 + lh t4, 384(a1) + lh t5, 130(a1) + lh t6, 2(a2) + lh t7, 2(a1) + lh t8, 386(a1) + +1: + andi t1, 0xffff + add t9, t0, t2 + andi t9, 0xffff + mul v1, t9, t1 + sra s0, t6, 15 + sll s0, s0, 1 + addiu s0, s0, 1 + addiu t9, t4, 16 + srav v1, v1, t9 + mul v1, v1, t3 + mul t6, t6, s0 + andi t7, 0xffff + addiu a2, a2, 4 + addiu a1, a1, 4 + add s1, t6, t5 + andi s1, 0xffff + sh v1, 0(a0) + + mul s2, s1, t7 + addiu s1, t8, 16 + srav s2, s2, s1 + mul s2,s2, s0 + lh t0, 0(a2) + lh t1, 0(a1) + sra t3, t0, 15 + sll t3, t3, 1 + addiu t3, t3, 1 + mul t0, t0, t3 + lh t2, 128(a1) + lh t4, 384(a1) + lh t5, 130(a1) + lh t8, 386(a1) + lh t6, 2(a2) + lh t7, 2(a1) + sh s2, 2(a0) + lh t0, 0(a2) + sra t3, t0, 15 + sll t3, t3, 1 + addiu t3, t3, 1 + mul t0, t0,t3 + bne a2, v0, 1b + addiu a0, a0, 4 + + andi t1, 0xffff + add t9, t0, t2 + andi t9, 0xffff + mul v1, t9, t1 + sra s0, t6, 15 + sll s0, s0, 1 + addiu s0, s0, 1 + addiu t9, t4, 16 + srav v1, v1, t9 + mul v1, v1, t3 + mul t6, t6, s0 + andi t7, 0xffff + sh v1, 0(a0) + add s1, t6, t5 + andi s1, 0xffff + mul s2, s1, t7 + addiu s1, t8, 16 + addiu a2, a2, 4 + addiu a1, a1, 4 + srav s2, s2, s1 + mul s2, s2, s0 + sh s2, 2(a0) + + RESTORE_REGS_FROM_STACK 16, s0, s1, s2 + + j ra + nop + +END(jsimd_quantize_mips_dspr2) + +/*****************************************************************************/ +LEAF_MIPS_DSPR2(jsimd_quantize_float_mips_dspr2) +/* + * a0 - coef_block + * a1 - divisors + * a2 - workspace + */ + + .set at + + li t1, 0x46800100 //integer representation 16384.5 + mtc1 t1, f0 + li t0, 63 +0: + lwc1 f2, 0(a2) + lwc1 f10, 0(a1) + lwc1 f4, 4(a2) + lwc1 f12, 4(a1) + lwc1 f6, 8(a2) + lwc1 f14, 8(a1) + lwc1 f8, 12(a2) + lwc1 f16, 12(a1) + madd.s f2, f0, f2, f10 + madd.s f4, f0, f4, f12 + madd.s f6, f0, f6, f14 + madd.s f8, f0, f8, f16 + lwc1 f10, 16(a1) + lwc1 f12, 20(a1) + trunc.w.s f2, f2 + trunc.w.s f4, f4 + trunc.w.s f6, f6 + trunc.w.s f8, f8 + lwc1 f14, 24(a1) + lwc1 f16, 28(a1) + mfc1 t1, f2 + mfc1 t2, f4 + mfc1 t3, f6 + mfc1 t4, f8 + lwc1 f2, 16(a2) + lwc1 f4, 20(a2) + lwc1 f6, 24(a2) + lwc1 f8, 28(a2) + madd.s f2, f0, f2, f10 + madd.s f4, f0, f4, f12 + madd.s f6, f0, f6, f14 + madd.s f8, f0, f8, f16 + addiu t1, t1, -16384 + addiu t2, t2, -16384 + addiu t3, t3, -16384 + addiu t4, t4, -16384 + trunc.w.s f2, f2 + trunc.w.s f4, f4 + trunc.w.s f6, f6 + trunc.w.s f8, f8 + sh t1, 0(a0) + sh t2, 2(a0) + sh t3, 4(a0) + sh t4, 6(a0) + mfc1 t1, f2 + mfc1 t2, f4 + mfc1 t3, f6 + mfc1 t4, f8 + addiu t0, t0, -8 + addiu a2, a2, 32 + addiu a1, a1, 32 + addiu t1, t1, -16384 + addiu t2, t2, -16384 + addiu t3, t3, -16384 + addiu t4, t4, -16384 + sh t1, 8(a0) + sh t2, 10(a0) + sh t3, 12(a0) + sh t4, 14(a0) + bgez t0, 0b + addiu a0, a0, 16 + + j ra + nop + +END(jsimd_quantize_float_mips_dspr2) +/*****************************************************************************/ +LEAF_MIPS_DSPR2(jsimd_idct_2x2_mips_dspr2) +/* + * a0 - compptr->dct_table + * a1 - coef_block + * a2 - output_buf + * a3 - output_col + */ + .set at + + SAVE_REGS_ON_STACK 24, s0, s1, s2, s3, s4, s5 + + addiu sp, sp, -40 + move v0, sp + addiu s2, zero, 29692 + addiu s3, zero, -10426 + addiu s4, zero, 6967 + addiu s5, zero, -5906 + lh t0, 0(a1) // t0 = inptr[DCTSIZE*0] + lh t5, 0(a0) // t5 = quantptr[DCTSIZE*0] + lh t1, 48(a1) // t1 = inptr[DCTSIZE*3] + lh t6, 48(a0) // t6 = quantptr[DCTSIZE*3] + mul t4, t5, t0 + lh t0, 16(a1) // t0 = inptr[DCTSIZE*1] + lh t5, 16(a0) // t5 = quantptr[DCTSIZE*1] + mul t6, t6, t1 + mul t5, t5, t0 + lh t2, 80(a1) // t2 = inptr[DCTSIZE*5] + lh t7, 80(a0) // t7 = quantptr[DCTSIZE*5] + lh t3, 112(a1) // t3 = inptr[DCTSIZE*7] + lh t8, 112(a0) // t8 = quantptr[DCTSIZE*7] + mul t7, t7, t2 + mult zero, zero + mul t8, t8, t3 + li s0, 0x73FCD746 // s0 = (29692 << 16) | (-10426 & 0xffff) + li s1, 0x1B37E8EE // s1 = (6967 << 16) | (-5906 & 0xffff) + ins t6, t5, 16, 16 // t6 = t5|t6 + sll t4, t4, 15 + dpa.w.ph $ac0, t6, s0 + lh t1, 2(a1) + lh t6, 2(a0) + ins t8, t7, 16, 16 // t8 = t7|t8 + dpa.w.ph $ac0, t8, s1 + mflo t0, $ac0 + mul t5, t6, t1 + lh t1, 18(a1) + lh t6, 18(a0) + lh t2, 50(a1) + lh t7, 50(a0) + mul t6, t6, t1 + subu t8, t4, t0 + mul t7, t7, t2 + addu t0, t4, t0 + shra_r.w t0, t0, 13 + lh t1, 82(a1) + lh t2, 82(a0) + lh t3, 114(a1) + lh t4, 114(a0) + shra_r.w t8, t8, 13 + mul t1, t1, t2 + mul t3, t3, t4 + sw t0, 0(v0) + sw t8, 20(v0) + sll t4, t5, 15 + ins t7, t6, 16, 16 + mult zero, zero + dpa.w.ph $ac0, t7, s0 + ins t3, t1, 16, 16 + lh t1, 6(a1) + lh t6, 6(a0) + dpa.w.ph $ac0, t3, s1 + mflo t0, $ac0 + mul t5, t6, t1 + lh t1, 22(a1) + lh t6, 22(a0) + lh t2, 54(a1) + lh t7, 54(a0) + mul t6, t6, t1 + subu t8, t4, t0 + mul t7, t7, t2 + addu t0, t4, t0 + shra_r.w t0, t0, 13 + lh t1, 86(a1) + lh t2, 86(a0) + lh t3, 118(a1) + lh t4, 118(a0) + shra_r.w t8, t8, 13 + mul t1, t1, t2 + mul t3, t3, t4 + sw t0, 4(v0) + sw t8, 24(v0) + sll t4, t5, 15 + ins t7, t6, 16, 16 + mult zero, zero + dpa.w.ph $ac0, t7, s0 + ins t3, t1, 16, 16 + lh t1, 10(a1) + lh t6, 10(a0) + dpa.w.ph $ac0, t3, s1 + mflo t0, $ac0 + mul t5, t6, t1 + lh t1, 26(a1) + lh t6, 26(a0) + lh t2, 58(a1) + lh t7, 58(a0) + mul t6, t6, t1 + subu t8, t4, t0 + mul t7, t7, t2 + addu t0, t4, t0 + shra_r.w t0, t0, 13 + lh t1, 90(a1) + lh t2, 90(a0) + lh t3, 122(a1) + lh t4, 122(a0) + shra_r.w t8, t8, 13 + mul t1, t1, t2 + mul t3, t3, t4 + sw t0, 8(v0) + sw t8, 28(v0) + sll t4, t5, 15 + ins t7, t6, 16, 16 + mult zero, zero + dpa.w.ph $ac0, t7, s0 + ins t3, t1, 16, 16 + lh t1, 14(a1) + lh t6, 14(a0) + dpa.w.ph $ac0, t3, s1 + mflo t0, $ac0 + mul t5, t6, t1 + lh t1, 30(a1) + lh t6, 30(a0) + lh t2, 62(a1) + lh t7, 62(a0) + mul t6, t6, t1 + subu t8, t4, t0 + mul t7, t7, t2 + addu t0, t4, t0 + shra_r.w t0, t0, 13 + lh t1, 94(a1) + lh t2, 94(a0) + lh t3, 126(a1) + lh t4, 126(a0) + shra_r.w t8, t8, 13 + mul t1, t1, t2 + mul t3, t3, t4 + sw t0, 12(v0) + sw t8, 32(v0) + sll t4, t5, 15 + ins t7, t6, 16, 16 + mult zero, zero + dpa.w.ph $ac0, t7, s0 + ins t3, t1, 16, 16 + dpa.w.ph $ac0, t3, s1 + mflo t0, $ac0 + lw t9, 0(a2) + lw t3, 0(v0) + lw t7, 4(v0) + lw t1, 8(v0) + addu t9, t9, a3 + sll t3, t3, 15 + subu t8, t4, t0 + addu t0, t4, t0 + shra_r.w t0, t0, 13 + shra_r.w t8, t8, 13 + sw t0, 16(v0) + sw t8, 36(v0) + lw t5, 12(v0) + lw t6, 16(v0) + mult t7, s2 + madd t1, s3 + madd t5, s4 + madd t6, s5 + lw t5, 24(v0) + lw t7, 28(v0) + mflo t0, $ac0 + lw t8, 32(v0) + lw t2, 36(v0) + mult $ac1, t5, s2 + madd $ac1, t7, s3 + madd $ac1, t8, s4 + madd $ac1, t2, s5 + addu t1, t3, t0 + subu t6, t3, t0 + shra_r.w t1, t1, 20 + shra_r.w t6, t6, 20 + mflo t4, $ac1 + shll_s.w t1, t1, 24 + shll_s.w t6, t6, 24 + sra t1, t1, 24 + sra t6, t6, 24 + addiu t1, t1, 128 + addiu t6, t6, 128 + lw t0, 20(v0) + sb t1, 0(t9) + sb t6, 1(t9) + sll t0, t0, 15 + lw t9, 4(a2) + addu t1, t0, t4 + subu t6, t0, t4 + addu t9, t9, a3 + shra_r.w t1, t1, 20 + shra_r.w t6, t6, 20 + shll_s.w t1, t1, 24 + shll_s.w t6, t6, 24 + sra t1, t1, 24 + sra t6, t6, 24 + addiu t1, t1, 128 + addiu t6, t6, 128 + sb t1, 0(t9) + sb t6, 1(t9) + addiu sp, sp, 40 + + RESTORE_REGS_FROM_STACK 24, s0, s1, s2, s3, s4, s5 + + j ra + nop + +END(jsimd_idct_2x2_mips_dspr2) + +/*****************************************************************************/ +LEAF_MIPS_DSPR2(jsimd_idct_4x4_mips_dspr2) +/* + * a0 - compptr->dct_table + * a1 - coef_block + * a2 - output_buf + * a3 - output_col + * 16(sp) - workspace[DCTSIZE*4]; // buffers data between passes + */ + + .set at + SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7 + + lw v1, 48(sp) + move t0, a1 + move t1, v1 + li t9, 4 + li s0, 0x2e75f93e + li s1, 0x21f9ba79 + li s2, 0xecc2efb0 + li s3, 0x52031ccd + +0: + lh s6, 32(t0) // inptr[DCTSIZE*2] + lh t6, 32(a0) // quantptr[DCTSIZE*2] + lh s7, 96(t0) // inptr[DCTSIZE*6] + lh t7, 96(a0) // quantptr[DCTSIZE*6] + mul t6, s6, t6 // z2 = (inptr[DCTSIZE*2] * quantptr[DCTSIZE*2]) + lh s4, 0(t0) // inptr[DCTSIZE*0] + mul t7, s7, t7 // z3 = (inptr[DCTSIZE*6] * quantptr[DCTSIZE*6]) + lh s5, 0(a0) // quantptr[0] + li s6, 15137 + li s7, 6270 + mul t2, s4, s5 // tmp0 = (inptr[0] * quantptr[0]) + mul t6, s6, t6 // z2 = (inptr[DCTSIZE*2] * quantptr[DCTSIZE*2]) + lh t5, 112(t0) // inptr[DCTSIZE*7] + mul t7, s7, t7 // z3 = (inptr[DCTSIZE*6] * quantptr[DCTSIZE*6]) + lh s4, 112(a0) // quantptr[DCTSIZE*7] + lh v0, 80(t0) // inptr[DCTSIZE*5] + lh s5, 80(a0) // quantptr[DCTSIZE*5] + lh s6, 48(a0) // quantptr[DCTSIZE*3] + sll t2, t2, 14 // tmp0 <<= (CONST_BITS+1) + lh s7, 16(a0) // quantptr[DCTSIZE*1] + lh t8, 16(t0) // inptr[DCTSIZE*1] + subu t6, t6, t7 // tmp2 = MULTIPLY(z2, t5) - MULTIPLY(z3, t6) + lh t7, 48(t0) // inptr[DCTSIZE*3] + mul t5, s4, t5 // z1 = (inptr[DCTSIZE*7] * quantptr[DCTSIZE*7]) + mul v0, s5, v0 // z2 = (inptr[DCTSIZE*5] * quantptr[DCTSIZE*5]) + mul t7, s6, t7 // z3 = (inptr[DCTSIZE*3] * quantptr[DCTSIZE*3]) + mul t8, s7, t8 // z4 = (inptr[DCTSIZE*1] * quantptr[DCTSIZE*1]) + addu t3, t2, t6 // tmp10 = tmp0 + z2 + subu t4, t2, t6 // tmp10 = tmp0 - z2 + mult $ac0, zero, zero + mult $ac1, zero, zero + ins t5, v0, 16, 16 + ins t7, t8, 16, 16 + addiu t9, t9, -1 + dpa.w.ph $ac0, t5, s0 + dpa.w.ph $ac0, t7, s1 + dpa.w.ph $ac1, t5, s2 + dpa.w.ph $ac1, t7, s3 + mflo s4, $ac0 + mflo s5, $ac1 + addiu a0, a0, 2 + addiu t1, t1, 4 + addiu t0, t0, 2 + addu t6, t4, s4 + subu t5, t4, s4 + addu s6, t3, s5 + subu s7, t3, s5 + shra_r.w t6, t6, 12 // DESCALE(tmp12 + temp1, 12) + shra_r.w t5, t5, 12 // DESCALE(tmp12 - temp1, 12) + shra_r.w s6, s6, 12 // DESCALE(tmp10 + temp2, 12) + shra_r.w s7, s7, 12 // DESCALE(tmp10 - temp2, 12) + sw t6, 28(t1) + sw t5, 60(t1) + sw s6, -4(t1) + bgtz t9, 0b + sw s7, 92(t1) + // second loop three pass + li t9, 3 +1: + lh s6, 34(t0) // inptr[DCTSIZE*2] + lh t6, 34(a0) // quantptr[DCTSIZE*2] + lh s7, 98(t0) // inptr[DCTSIZE*6] + lh t7, 98(a0) // quantptr[DCTSIZE*6] + mul t6, s6, t6 // z2 = (inptr[DCTSIZE*2] * quantptr[DCTSIZE*2]) + lh s4, 2(t0) // inptr[DCTSIZE*0] + mul t7, s7, t7 // z3 = (inptr[DCTSIZE*6] * quantptr[DCTSIZE*6]) + lh s5, 2(a0) // quantptr[DCTSIZE*0] + li s6, 15137 + li s7, 6270 + mul t2, s4, s5 // tmp0 = (inptr[0] * quantptr[0]) + mul v0, s6, t6 // z2 = (inptr[DCTSIZE*2] * quantptr[DCTSIZE*2]) + lh t5, 114(t0) // inptr[DCTSIZE*7] + mul t7, s7, t7 // z3 = (inptr[DCTSIZE*6] * quantptr[DCTSIZE*6]) + lh s4, 114(a0) // quantptr[DCTSIZE*7] + lh s5, 82(a0) // quantptr[DCTSIZE*5] + lh t6, 82(t0) // inptr[DCTSIZE*5] + sll t2, t2, 14 // tmp0 <<= (CONST_BITS+1) + lh s6, 50(a0) // quantptr[DCTSIZE*3] + lh t8, 18(t0) // inptr[DCTSIZE*1] + subu v0, v0, t7 // tmp2 = MULTIPLY(z2, t5) - MULTIPLY(z3, t6) + lh t7, 50(t0) // inptr[DCTSIZE*3] + lh s7, 18(a0) // quantptr[DCTSIZE*1] + mul t5, s4, t5 // z1 = (inptr[DCTSIZE*7] * quantptr[DCTSIZE*7]) + mul t6, s5, t6 // z2 = (inptr[DCTSIZE*5] * quantptr[DCTSIZE*5]) + mul t7, s6, t7 // z3 = (inptr[DCTSIZE*3] * quantptr[DCTSIZE*3]) + mul t8, s7, t8 // z4 = (inptr[DCTSIZE*1] * quantptr[DCTSIZE*1]) + addu t3, t2, v0 // tmp10 = tmp0 + z2 + subu t4, t2, v0 // tmp10 = tmp0 - z2 + mult $ac0, zero, zero + mult $ac1, zero, zero + ins t5, t6, 16, 16 + ins t7, t8, 16, 16 + dpa.w.ph $ac0, t5, s0 + dpa.w.ph $ac0, t7, s1 + dpa.w.ph $ac1, t5, s2 + dpa.w.ph $ac1, t7, s3 + mflo t5, $ac0 + mflo t6, $ac1 + addiu t9, t9, -1 + addiu t0, t0, 2 + addiu a0, a0, 2 + addiu t1, t1, 4 + addu s5, t4, t5 + subu s4, t4, t5 + addu s6, t3, t6 + subu s7, t3, t6 + shra_r.w s5, s5, 12 // DESCALE(tmp12 + temp1, 12) + shra_r.w s4, s4, 12 // DESCALE(tmp12 - temp1, 12) + shra_r.w s6, s6, 12 // DESCALE(tmp10 + temp2, 12) + shra_r.w s7, s7, 12 // DESCALE(tmp10 - temp2, 12) + sw s5, 32(t1) + sw s4, 64(t1) + sw s6, 0(t1) + bgtz t9, 1b + sw s7, 96(t1) + move t1, v1 + li s4, 15137 + lw s6, 8(t1) // wsptr[2] + li s5, 6270 + lw s7, 24(t1) // wsptr[6] + mul s4, s4, s6 // MULTIPLY((JLONG) wsptr[2], FIX_1_847759065) + lw t2, 0(t1) // wsptr[0] + mul s5, s5, s7 // MULTIPLY((JLONG) wsptr[6], - FIX_0_765366865) + lh t5, 28(t1) // wsptr[7] + lh t6, 20(t1) // wsptr[5] + lh t7, 12(t1) // wsptr[3] + lh t8, 4(t1) // wsptr[1] + ins t5, t6, 16, 16 + ins t7, t8, 16, 16 + mult $ac0, zero, zero + dpa.w.ph $ac0, t5, s0 + dpa.w.ph $ac0, t7, s1 + mult $ac1, zero, zero + dpa.w.ph $ac1, t5, s2 + dpa.w.ph $ac1, t7, s3 + sll t2, t2, 14 // tmp0 = ((JLONG) wsptr[0]) << (CONST_BITS+1) + mflo s6, $ac0 + // MULTIPLY(wsptr[2], FIX_1_847759065 + MULTIPLY(wsptr[6], -FIX_0_765366865) + subu s4, s4, s5 + addu t3, t2, s4 // tmp10 = tmp0 + z2 + mflo s7, $ac1 + subu t4, t2, s4 // tmp10 = tmp0 - z2 + addu t7, t4, s6 + subu t8, t4, s6 + addu t5, t3, s7 + subu t6, t3, s7 + shra_r.w t5, t5, 19 // DESCALE(tmp10 + temp2, 19) + shra_r.w t6, t6, 19 // DESCALE(tmp10 - temp2, 19) + shra_r.w t7, t7, 19 // DESCALE(tmp12 + temp1, 19) + shra_r.w t8, t8, 19 // DESCALE(tmp12 - temp1, 19) + sll s4, t9, 2 + lw v0, 0(a2) // output_buf[ctr] + shll_s.w t5, t5, 24 + shll_s.w t6, t6, 24 + shll_s.w t7, t7, 24 + shll_s.w t8, t8, 24 + sra t5, t5, 24 + sra t6, t6, 24 + sra t7, t7, 24 + sra t8, t8, 24 + addu v0, v0, a3 // outptr = output_buf[ctr] + output_col + addiu t5, t5, 128 + addiu t6, t6, 128 + addiu t7, t7, 128 + addiu t8, t8, 128 + sb t5, 0(v0) + sb t7, 1(v0) + sb t8, 2(v0) + sb t6, 3(v0) + // 2 + li s4, 15137 + lw s6, 40(t1) // wsptr[2] + li s5, 6270 + lw s7, 56(t1) // wsptr[6] + mul s4, s4, s6 // MULTIPLY((JLONG) wsptr[2], FIX_1_847759065) + lw t2, 32(t1) // wsptr[0] + mul s5, s5, s7 // MULTIPLY((JLONG) wsptr[6], - FIX_0_765366865) + lh t5, 60(t1) // wsptr[7] + lh t6, 52(t1) // wsptr[5] + lh t7, 44(t1) // wsptr[3] + lh t8, 36(t1) // wsptr[1] + ins t5, t6, 16, 16 + ins t7, t8, 16, 16 + mult $ac0, zero, zero + dpa.w.ph $ac0, t5, s0 + dpa.w.ph $ac0, t7, s1 + mult $ac1, zero, zero + dpa.w.ph $ac1, t5, s2 + dpa.w.ph $ac1, t7, s3 + sll t2, t2, 14 // tmp0 = ((JLONG) wsptr[0]) << (CONST_BITS+1) + mflo s6, $ac0 + // MULTIPLY(wsptr[2], FIX_1_847759065 + MULTIPLY(wsptr[6], -FIX_0_765366865) + subu s4, s4, s5 + addu t3, t2, s4 // tmp10 = tmp0 + z2 + mflo s7, $ac1 + subu t4, t2, s4 // tmp10 = tmp0 - z2 + addu t7, t4, s6 + subu t8, t4, s6 + addu t5, t3, s7 + subu t6, t3, s7 + shra_r.w t5, t5, 19 // DESCALE(tmp10 + temp2, CONST_BITS-PASS1_BITS+1) + shra_r.w t6, t6, 19 // DESCALE(tmp10 - temp2, CONST_BITS-PASS1_BITS+1) + shra_r.w t7, t7, 19 // DESCALE(tmp12 + temp1, CONST_BITS-PASS1_BITS+1) + shra_r.w t8, t8, 19 // DESCALE(tmp12 - temp1, CONST_BITS-PASS1_BITS+1) + sll s4, t9, 2 + lw v0, 4(a2) // output_buf[ctr] + shll_s.w t5, t5, 24 + shll_s.w t6, t6, 24 + shll_s.w t7, t7, 24 + shll_s.w t8, t8, 24 + sra t5, t5, 24 + sra t6, t6, 24 + sra t7, t7, 24 + sra t8, t8, 24 + addu v0, v0, a3 // outptr = output_buf[ctr] + output_col + addiu t5, t5, 128 + addiu t6, t6, 128 + addiu t7, t7, 128 + addiu t8, t8, 128 + sb t5, 0(v0) + sb t7, 1(v0) + sb t8, 2(v0) + sb t6, 3(v0) + // 3 + li s4, 15137 + lw s6, 72(t1) // wsptr[2] + li s5, 6270 + lw s7, 88(t1) // wsptr[6] + mul s4, s4, s6 // MULTIPLY((JLONG) wsptr[2], FIX_1_847759065) + lw t2, 64(t1) // wsptr[0] + mul s5, s5, s7 // MULTIPLY((JLONG) wsptr[6], - FIX_0_765366865) + lh t5, 92(t1) // wsptr[7] + lh t6, 84(t1) // wsptr[5] + lh t7, 76(t1) // wsptr[3] + lh t8, 68(t1) // wsptr[1] + ins t5, t6, 16, 16 + ins t7, t8, 16, 16 + mult $ac0, zero, zero + dpa.w.ph $ac0, t5, s0 + dpa.w.ph $ac0, t7, s1 + mult $ac1, zero, zero + dpa.w.ph $ac1, t5, s2 + dpa.w.ph $ac1, t7, s3 + sll t2, t2, 14 // tmp0 = ((JLONG) wsptr[0]) << (CONST_BITS+1) + mflo s6, $ac0 + // MULTIPLY(wsptr[2], FIX_1_847759065 + MULTIPLY(wsptr[6], -FIX_0_765366865) + subu s4, s4, s5 + addu t3, t2, s4 // tmp10 = tmp0 + z2 + mflo s7, $ac1 + subu t4, t2, s4 // tmp10 = tmp0 - z2 + addu t7, t4, s6 + subu t8, t4, s6 + addu t5, t3, s7 + subu t6, t3, s7 + shra_r.w t5, t5, 19 // DESCALE(tmp10 + temp2, 19) + shra_r.w t6, t6, 19 // DESCALE(tmp10 - temp2, 19) + shra_r.w t7, t7, 19 // DESCALE(tmp12 + temp1, 19) + shra_r.w t8, t8, 19 // DESCALE(tmp12 - temp1, 19) + sll s4, t9, 2 + lw v0, 8(a2) // output_buf[ctr] + shll_s.w t5, t5, 24 + shll_s.w t6, t6, 24 + shll_s.w t7, t7, 24 + shll_s.w t8, t8, 24 + sra t5, t5, 24 + sra t6, t6, 24 + sra t7, t7, 24 + sra t8, t8, 24 + addu v0, v0, a3 // outptr = output_buf[ctr] + output_col + addiu t5, t5, 128 + addiu t6, t6, 128 + addiu t7, t7, 128 + addiu t8, t8, 128 + sb t5, 0(v0) + sb t7, 1(v0) + sb t8, 2(v0) + sb t6, 3(v0) + li s4, 15137 + lw s6, 104(t1) // wsptr[2] + li s5, 6270 + lw s7, 120(t1) // wsptr[6] + mul s4, s4, s6 // MULTIPLY((JLONG) wsptr[2], FIX_1_847759065) + lw t2, 96(t1) // wsptr[0] + mul s5, s5, s7 // MULTIPLY((JLONG) wsptr[6], -FIX_0_765366865) + lh t5, 124(t1) // wsptr[7] + lh t6, 116(t1) // wsptr[5] + lh t7, 108(t1) // wsptr[3] + lh t8, 100(t1) // wsptr[1] + ins t5, t6, 16, 16 + ins t7, t8, 16, 16 + mult $ac0, zero, zero + dpa.w.ph $ac0, t5, s0 + dpa.w.ph $ac0, t7, s1 + mult $ac1, zero, zero + dpa.w.ph $ac1, t5, s2 + dpa.w.ph $ac1, t7, s3 + sll t2, t2, 14 // tmp0 = ((JLONG) wsptr[0]) << (CONST_BITS+1) + mflo s6, $ac0 + // MULTIPLY(wsptr[2], FIX_1_847759065 + MULTIPLY(wsptr[6], -FIX_0_765366865) + subu s4, s4, s5 + addu t3, t2, s4 // tmp10 = tmp0 + z2; + mflo s7, $ac1 + subu t4, t2, s4 // tmp10 = tmp0 - z2; + addu t7, t4, s6 + subu t8, t4, s6 + addu t5, t3, s7 + subu t6, t3, s7 + shra_r.w t5, t5, 19 // DESCALE(tmp10 + temp2, 19) + shra_r.w t6, t6, 19 // DESCALE(tmp10 - temp2, 19) + shra_r.w t7, t7, 19 // DESCALE(tmp12 + temp1, 19) + shra_r.w t8, t8, 19 // DESCALE(tmp12 - temp1, 19) + sll s4, t9, 2 + lw v0, 12(a2) // output_buf[ctr] + shll_s.w t5, t5, 24 + shll_s.w t6, t6, 24 + shll_s.w t7, t7, 24 + shll_s.w t8, t8, 24 + sra t5, t5, 24 + sra t6, t6, 24 + sra t7, t7, 24 + sra t8, t8, 24 + addu v0, v0, a3 // outptr = output_buf[ctr] + output_col + addiu t5, t5, 128 + addiu t6, t6, 128 + addiu t7, t7, 128 + addiu t8, t8, 128 + sb t5, 0(v0) + sb t7, 1(v0) + sb t8, 2(v0) + sb t6, 3(v0) + + RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7 + + j ra + nop +END(jsimd_idct_4x4_mips_dspr2) + +/*****************************************************************************/ +LEAF_MIPS_DSPR2(jsimd_idct_6x6_mips_dspr2) +/* + * a0 - compptr->dct_table + * a1 - coef_block + * a2 - output_buf + * a3 - output_col + */ + .set at + + SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7 + + addiu sp, sp, -144 + move v0, sp + addiu v1, v0, 24 + addiu t9, zero, 5793 + addiu s0, zero, 10033 + addiu s1, zero, 2998 + +1: + lh s2, 0(a0) // q0 = quantptr[ 0] + lh s3, 32(a0) // q1 = quantptr[16] + lh s4, 64(a0) // q2 = quantptr[32] + lh t2, 64(a1) // tmp2 = inptr[32] + lh t1, 32(a1) // tmp1 = inptr[16] + lh t0, 0(a1) // tmp0 = inptr[ 0] + mul t2, t2, s4 // tmp2 = tmp2 * q2 + mul t1, t1, s3 // tmp1 = tmp1 * q1 + mul t0, t0, s2 // tmp0 = tmp0 * q0 + lh t6, 16(a1) // z1 = inptr[ 8] + lh t8, 80(a1) // z3 = inptr[40] + lh t7, 48(a1) // z2 = inptr[24] + lh s2, 16(a0) // q0 = quantptr[ 8] + lh s4, 80(a0) // q2 = quantptr[40] + lh s3, 48(a0) // q1 = quantptr[24] + mul t2, t2, t9 // tmp2 = tmp2 * 5793 + mul t1, t1, s0 // tmp1 = tmp1 * 10033 + sll t0, t0, 13 // tmp0 = tmp0 << 13 + mul t6, t6, s2 // z1 = z1 * q0 + mul t8, t8, s4 // z3 = z3 * q2 + mul t7, t7, s3 // z2 = z2 * q1 + addu t3, t0, t2 // tmp10 = tmp0 + tmp2 + sll t2, t2, 1 // tmp2 = tmp2 << 2 + subu t4, t0, t2 // tmp11 = tmp0 - tmp2; + subu t5, t3, t1 // tmp12 = tmp10 - tmp1 + addu t3, t3, t1 // tmp10 = tmp10 + tmp1 + addu t1, t6, t8 // tmp1 = z1 + z3 + mul t1, t1, s1 // tmp1 = tmp1 * 2998 + shra_r.w t4, t4, 11 // tmp11 = (tmp11 + 1024) >> 11 + subu t2, t6, t8 // tmp2 = z1 - z3 + subu t2, t2, t7 // tmp2 = tmp2 - z2 + sll t2, t2, 2 // tmp2 = tmp2 << 2 + addu t0, t6, t7 // tmp0 = z1 + z2 + sll t0, t0, 13 // tmp0 = tmp0 << 13 + subu s2, t8, t7 // q0 = z3 - z2 + sll s2, s2, 13 // q0 = q0 << 13 + addu t0, t0, t1 // tmp0 = tmp0 + tmp1 + addu t1, s2, t1 // tmp1 = q0 + tmp1 + addu s2, t4, t2 // q0 = tmp11 + tmp2 + subu s3, t4, t2 // q1 = tmp11 - tmp2 + addu t6, t3, t0 // z1 = tmp10 + tmp0 + subu t7, t3, t0 // z2 = tmp10 - tmp0 + addu t4, t5, t1 // tmp11 = tmp12 + tmp1 + subu t5, t5, t1 // tmp12 = tmp12 - tmp1 + shra_r.w t6, t6, 11 // z1 = (z1 + 1024) >> 11 + shra_r.w t7, t7, 11 // z2 = (z2 + 1024) >> 11 + shra_r.w t4, t4, 11 // tmp11 = (tmp11 + 1024) >> 11 + shra_r.w t5, t5, 11 // tmp12 = (tmp12 + 1024) >> 11 + sw s2, 24(v0) + sw s3, 96(v0) + sw t6, 0(v0) + sw t7, 120(v0) + sw t4, 48(v0) + sw t5, 72(v0) + addiu v0, v0, 4 + addiu a1, a1, 2 + bne v0, v1, 1b + addiu a0, a0, 2 + + /* Pass 2: process 6 rows from work array, store into output array. */ + move v0, sp + addiu v1, v0, 144 + +2: + lw t0, 0(v0) + lw t2, 16(v0) + lw s5, 0(a2) + addiu t0, t0, 16 + sll t0, t0, 13 + mul t3, t2, t9 + lw t6, 4(v0) + lw t8, 20(v0) + lw t7, 12(v0) + addu s5, s5, a3 + addu s6, t6, t8 + mul s6, s6, s1 + addu t1, t0, t3 + subu t4, t0, t3 + subu t4, t4, t3 + lw t3, 8(v0) + mul t0, t3, s0 + addu s7, t6, t7 + sll s7, s7, 13 + addu s7, s6, s7 + subu t2, t8, t7 + sll t2, t2, 13 + addu t2, s6, t2 + subu s6, t6, t7 + subu s6, s6, t8 + sll s6, s6, 13 + addu t3, t1, t0 + subu t5, t1, t0 + addu t6, t3, s7 + subu t3, t3, s7 + addu t7, t4, s6 + subu t4, t4, s6 + addu t8, t5, t2 + subu t5, t5, t2 + shll_s.w t6, t6, 6 + shll_s.w t3, t3, 6 + shll_s.w t7, t7, 6 + shll_s.w t4, t4, 6 + shll_s.w t8, t8, 6 + shll_s.w t5, t5, 6 + sra t6, t6, 24 + addiu t6, t6, 128 + sra t3, t3, 24 + addiu t3, t3, 128 + sb t6, 0(s5) + sra t7, t7, 24 + addiu t7, t7, 128 + sb t3, 5(s5) + sra t4, t4, 24 + addiu t4, t4, 128 + sb t7, 1(s5) + sra t8, t8, 24 + addiu t8, t8, 128 + sb t4, 4(s5) + addiu v0, v0, 24 + sra t5, t5, 24 + addiu t5, t5, 128 + sb t8, 2(s5) + addiu a2, a2, 4 + bne v0, v1, 2b + sb t5, 3(s5) + + addiu sp, sp, 144 + + RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7 + + j ra + nop + +END(jsimd_idct_6x6_mips_dspr2) + +/*****************************************************************************/ +LEAF_MIPS_DSPR2(jsimd_idct_12x12_pass1_mips_dspr2) +/* + * a0 - compptr->dct_table + * a1 - coef_block + * a2 - workspace + */ + + SAVE_REGS_ON_STACK 16, s0, s1, s2, s3 + + li a3, 8 + +1: + // odd part + lh t0, 48(a1) + lh t1, 48(a0) + lh t2, 16(a1) + lh t3, 16(a0) + lh t4, 80(a1) + lh t5, 80(a0) + lh t6, 112(a1) + lh t7, 112(a0) + mul t0, t0, t1 // z2 + mul t1, t2, t3 // z1 + mul t2, t4, t5 // z3 + mul t3, t6, t7 // z4 + li t4, 10703 // FIX(1.306562965) + li t5, 4433 // FIX_0_541196100 + li t6, 7053 // FIX(0.860918669) + mul t4, t0,t4 // tmp11 + mul t5, t0,t5 // -tmp14 + addu t7, t1,t2 // tmp10 + addu t8, t7,t3 // tmp10 + z4 + mul t6, t6, t8 // tmp15 + li t8, 2139 // FIX(0.261052384) + mul t8, t7, t8 // MULTIPLY(tmp10, FIX(0.261052384)) + li t7, 2295 // FIX(0.280143716) + mul t7, t1, t7 // MULTIPLY(z1, FIX(0.280143716)) + addu t9, t2, t3 // z3 + z4 + li s0, 8565 // FIX(1.045510580) + mul t9, t9, s0 // -tmp13 + li s0, 12112 // FIX(1.478575242) + mul s0, t2, s0 // MULTIPLY(z3, FIX(1.478575242) + li s1, 12998 // FIX(1.586706681) + mul s1, t3, s1 // MULTIPLY(z4, FIX(1.586706681)) + li s2, 5540 // FIX(0.676326758) + mul s2, t1, s2 // MULTIPLY(z1, FIX(0.676326758)) + li s3, 16244 // FIX(1.982889723) + mul s3, t3, s3 // MULTIPLY(z4, FIX(1.982889723)) + subu t1, t1, t3 // z1-=z4 + subu t0, t0, t2 // z2-=z3 + addu t2, t0, t1 // z1+z2 + li t3, 4433 // FIX_0_541196100 + mul t2, t2, t3 // z3 + li t3, 6270 // FIX_0_765366865 + mul t1, t1, t3 // MULTIPLY(z1, FIX_0_765366865) + li t3, 15137 // FIX_0_765366865 + mul t0, t0, t3 // MULTIPLY(z2, FIX_1_847759065) + addu t8, t6, t8 // tmp12 + addu t3, t8, t4 // tmp12 + tmp11 + addu t3, t3, t7 // tmp10 + subu t8, t8, t9 // tmp12 + tmp13 + addu s0, t5, s0 + subu t8, t8, s0 // tmp12 + subu t9, t6, t9 + subu s1, s1, t4 + addu t9, t9, s1 // tmp13 + subu t6, t6, t5 + subu t6, t6, s2 + subu t6, t6, s3 // tmp15 + // even part start + lh t4, 64(a1) + lh t5, 64(a0) + lh t7, 32(a1) + lh s0, 32(a0) + lh s1, 0(a1) + lh s2, 0(a0) + lh s3, 96(a1) + lh v0, 96(a0) + mul t4, t4, t5 // DEQUANTIZE(inptr[DCTSIZE*4],quantptr[DCTSIZE*4]) + mul t5, t7, s0 // DEQUANTIZE(inptr[DCTSIZE*2],quantptr[DCTSIZE*2]) + mul t7, s1, s2 // DEQUANTIZE(inptr[DCTSIZE*0],quantptr[DCTSIZE*0]) + mul s0, s3, v0 // DEQUANTIZE(inptr[DCTSIZE*6],quantptr[DCTSIZE*6]) + // odd part end + addu t1, t2, t1 // tmp11 + subu t0, t2, t0 // tmp14 + // update counter and pointers + addiu a3, a3, -1 + addiu a0, a0, 2 + addiu a1, a1, 2 + // even part rest + li s1, 10033 + li s2, 11190 + mul t4, t4, s1 // z4 + mul s1, t5, s2 // z4 + sll t5, t5, 13 // z1 + sll t7, t7, 13 + addiu t7, t7, 1024 // z3 + sll s0, s0, 13 // z2 + addu s2, t7, t4 // tmp10 + subu t4, t7, t4 // tmp11 + subu s3, t5, s0 // tmp12 + addu t2, t7, s3 // tmp21 + subu s3, t7, s3 // tmp24 + addu t7, s1, s0 // tmp12 + addu v0, s2, t7 // tmp20 + subu s2, s2, t7 // tmp25 + subu s1, s1, t5 // z4 - z1 + subu s1, s1, s0 // tmp12 + addu s0, t4, s1 // tmp22 + subu t4, t4, s1 // tmp23 + // final output stage + addu t5, v0, t3 + subu v0, v0, t3 + addu t3, t2, t1 + subu t2, t2, t1 + addu t1, s0, t8 + subu s0, s0, t8 + addu t8, t4, t9 + subu t4, t4, t9 + addu t9, s3, t0 + subu s3, s3, t0 + addu t0, s2, t6 + subu s2, s2, t6 + sra t5, t5, 11 + sra t3, t3, 11 + sra t1, t1, 11 + sra t8, t8, 11 + sra t9, t9, 11 + sra t0, t0, 11 + sra s2, s2, 11 + sra s3, s3, 11 + sra t4, t4, 11 + sra s0, s0, 11 + sra t2, t2, 11 + sra v0, v0, 11 + sw t5, 0(a2) + sw t3, 32(a2) + sw t1, 64(a2) + sw t8, 96(a2) + sw t9, 128(a2) + sw t0, 160(a2) + sw s2, 192(a2) + sw s3, 224(a2) + sw t4, 256(a2) + sw s0, 288(a2) + sw t2, 320(a2) + sw v0, 352(a2) + bgtz a3, 1b + addiu a2, a2, 4 + + RESTORE_REGS_FROM_STACK 16, s0, s1, s2, s3 + + j ra + nop + +END(jsimd_idct_12x12_pass1_mips_dspr2) + +/*****************************************************************************/ +LEAF_MIPS_DSPR2(jsimd_idct_12x12_pass2_mips_dspr2) +/* + * a0 - workspace + * a1 - output + */ + + SAVE_REGS_ON_STACK 16, s0, s1, s2, s3 + + li a3, 12 + +1: + // Odd part + lw t0, 12(a0) + lw t1, 4(a0) + lw t2, 20(a0) + lw t3, 28(a0) + li t4, 10703 // FIX(1.306562965) + li t5, 4433 // FIX_0_541196100 + mul t4, t0, t4 // tmp11 + mul t5, t0, t5 // -tmp14 + addu t6, t1, t2 // tmp10 + li t7, 2139 // FIX(0.261052384) + mul t7, t6, t7 // MULTIPLY(tmp10, FIX(0.261052384)) + addu t6, t6, t3 // tmp10 + z4 + li t8, 7053 // FIX(0.860918669) + mul t6, t6, t8 // tmp15 + li t8, 2295 // FIX(0.280143716) + mul t8, t1, t8 // MULTIPLY(z1, FIX(0.280143716)) + addu t9, t2, t3 // z3 + z4 + li s0, 8565 // FIX(1.045510580) + mul t9, t9, s0 // -tmp13 + li s0, 12112 // FIX(1.478575242) + mul s0, t2, s0 // MULTIPLY(z3, FIX(1.478575242)) + li s1, 12998 // FIX(1.586706681) + mul s1, t3, s1 // MULTIPLY(z4, FIX(1.586706681)) + li s2, 5540 // FIX(0.676326758) + mul s2, t1, s2 // MULTIPLY(z1, FIX(0.676326758)) + li s3, 16244 // FIX(1.982889723) + mul s3, t3, s3 // MULTIPLY(z4, FIX(1.982889723)) + subu t1, t1, t3 // z1 -= z4 + subu t0, t0, t2 // z2 -= z3 + addu t2, t1, t0 // z1 + z2 + li t3, 4433 // FIX_0_541196100 + mul t2, t2, t3 // z3 + li t3, 6270 // FIX_0_765366865 + mul t1, t1, t3 // MULTIPLY(z1, FIX_0_765366865) + li t3, 15137 // FIX_1_847759065 + mul t0, t0, t3 // MULTIPLY(z2, FIX_1_847759065) + addu t3, t6, t7 // tmp12 + addu t7, t3, t4 + addu t7, t7, t8 // tmp10 + subu t3, t3, t9 + subu t3, t3, t5 + subu t3, t3, s0 // tmp12 + subu t9, t6, t9 + subu t9, t9, t4 + addu t9, t9, s1 // tmp13 + subu t6, t6, t5 + subu t6, t6, s2 + subu t6, t6, s3 // tmp15 + addu t1, t2, t1 // tmp11 + subu t0, t2, t0 // tmp14 + // even part + lw t2, 16(a0) // z4 + lw t4, 8(a0) // z1 + lw t5, 0(a0) // z3 + lw t8, 24(a0) // z2 + li s0, 10033 // FIX(1.224744871) + li s1, 11190 // FIX(1.366025404) + mul t2, t2, s0 // z4 + mul s0, t4, s1 // z4 + addiu t5, t5, 0x10 + sll t5, t5, 13 // z3 + sll t4, t4, 13 // z1 + sll t8, t8, 13 // z2 + subu s1, t4, t8 // tmp12 + addu s2, t5, t2 // tmp10 + subu t2, t5, t2 // tmp11 + addu s3, t5, s1 // tmp21 + subu s1, t5, s1 // tmp24 + addu t5, s0, t8 // tmp12 + addu v0, s2, t5 // tmp20 + subu t5, s2, t5 // tmp25 + subu t4, s0, t4 + subu t4, t4, t8 // tmp12 + addu t8, t2, t4 // tmp22 + subu t2, t2, t4 // tmp23 + // increment counter and pointers + addiu a3, a3, -1 + addiu a0, a0, 32 + // Final stage + addu t4, v0, t7 + subu v0, v0, t7 + addu t7, s3, t1 + subu s3, s3, t1 + addu t1, t8, t3 + subu t8, t8, t3 + addu t3, t2, t9 + subu t2, t2, t9 + addu t9, s1, t0 + subu s1, s1, t0 + addu t0, t5, t6 + subu t5, t5, t6 + sll t4, t4, 4 + sll t7, t7, 4 + sll t1, t1, 4 + sll t3, t3, 4 + sll t9, t9, 4 + sll t0, t0, 4 + sll t5, t5, 4 + sll s1, s1, 4 + sll t2, t2, 4 + sll t8, t8, 4 + sll s3, s3, 4 + sll v0, v0, 4 + shll_s.w t4, t4, 2 + shll_s.w t7, t7, 2 + shll_s.w t1, t1, 2 + shll_s.w t3, t3, 2 + shll_s.w t9, t9, 2 + shll_s.w t0, t0, 2 + shll_s.w t5, t5, 2 + shll_s.w s1, s1, 2 + shll_s.w t2, t2, 2 + shll_s.w t8, t8, 2 + shll_s.w s3, s3, 2 + shll_s.w v0, v0, 2 + srl t4, t4, 24 + srl t7, t7, 24 + srl t1, t1, 24 + srl t3, t3, 24 + srl t9, t9, 24 + srl t0, t0, 24 + srl t5, t5, 24 + srl s1, s1, 24 + srl t2, t2, 24 + srl t8, t8, 24 + srl s3, s3, 24 + srl v0, v0, 24 + lw t6, 0(a1) + addiu t4, t4, 0x80 + addiu t7, t7, 0x80 + addiu t1, t1, 0x80 + addiu t3, t3, 0x80 + addiu t9, t9, 0x80 + addiu t0, t0, 0x80 + addiu t5, t5, 0x80 + addiu s1, s1, 0x80 + addiu t2, t2, 0x80 + addiu t8, t8, 0x80 + addiu s3, s3, 0x80 + addiu v0, v0, 0x80 + sb t4, 0(t6) + sb t7, 1(t6) + sb t1, 2(t6) + sb t3, 3(t6) + sb t9, 4(t6) + sb t0, 5(t6) + sb t5, 6(t6) + sb s1, 7(t6) + sb t2, 8(t6) + sb t8, 9(t6) + sb s3, 10(t6) + sb v0, 11(t6) + bgtz a3, 1b + addiu a1, a1, 4 + + RESTORE_REGS_FROM_STACK 16, s0, s1, s2, s3 + + jr ra + nop + +END(jsimd_idct_12x12_pass2_mips_dspr2) + +/*****************************************************************************/ +LEAF_MIPS_DSPR2(jsimd_convsamp_mips_dspr2) +/* + * a0 - sample_data + * a1 - start_col + * a2 - workspace + */ + + lw t0, 0(a0) + li t7, 0xff80ff80 + addu t0, t0, a1 + ulw t1, 0(t0) + ulw t2, 4(t0) + preceu.ph.qbr t3, t1 + preceu.ph.qbl t4, t1 + lw t0, 4(a0) + preceu.ph.qbr t5, t2 + preceu.ph.qbl t6, t2 + addu t0, t0, a1 + addu.ph t3, t3, t7 + addu.ph t4, t4, t7 + ulw t1, 0(t0) + ulw t2, 4(t0) + addu.ph t5, t5, t7 + addu.ph t6, t6, t7 + usw t3, 0(a2) + usw t4, 4(a2) + preceu.ph.qbr t3, t1 + preceu.ph.qbl t4, t1 + usw t5, 8(a2) + usw t6, 12(a2) + + lw t0, 8(a0) + preceu.ph.qbr t5, t2 + preceu.ph.qbl t6, t2 + addu t0, t0, a1 + addu.ph t3, t3, t7 + addu.ph t4, t4, t7 + ulw t1, 0(t0) + ulw t2, 4(t0) + addu.ph t5, t5, t7 + addu.ph t6, t6, t7 + usw t3, 16(a2) + usw t4, 20(a2) + preceu.ph.qbr t3, t1 + preceu.ph.qbl t4, t1 + usw t5, 24(a2) + usw t6, 28(a2) + + lw t0, 12(a0) + preceu.ph.qbr t5, t2 + preceu.ph.qbl t6, t2 + addu t0, t0, a1 + addu.ph t3, t3, t7 + addu.ph t4, t4, t7 + ulw t1, 0(t0) + ulw t2, 4(t0) + addu.ph t5, t5, t7 + addu.ph t6, t6, t7 + usw t3, 32(a2) + usw t4, 36(a2) + preceu.ph.qbr t3, t1 + preceu.ph.qbl t4, t1 + usw t5, 40(a2) + usw t6, 44(a2) + + lw t0, 16(a0) + preceu.ph.qbr t5, t2 + preceu.ph.qbl t6, t2 + addu t0, t0, a1 + addu.ph t3, t3, t7 + addu.ph t4, t4, t7 + ulw t1, 0(t0) + ulw t2, 4(t0) + addu.ph t5, t5, t7 + addu.ph t6, t6, t7 + usw t3, 48(a2) + usw t4, 52(a2) + preceu.ph.qbr t3, t1 + preceu.ph.qbl t4, t1 + usw t5, 56(a2) + usw t6, 60(a2) + + lw t0, 20(a0) + preceu.ph.qbr t5, t2 + preceu.ph.qbl t6, t2 + addu t0, t0, a1 + addu.ph t3, t3, t7 + addu.ph t4, t4, t7 + ulw t1, 0(t0) + ulw t2, 4(t0) + addu.ph t5, t5, t7 + addu.ph t6, t6, t7 + usw t3, 64(a2) + usw t4, 68(a2) + preceu.ph.qbr t3, t1 + preceu.ph.qbl t4, t1 + usw t5, 72(a2) + usw t6, 76(a2) + + lw t0, 24(a0) + preceu.ph.qbr t5, t2 + preceu.ph.qbl t6, t2 + addu t0, t0, a1 + addu.ph t3, t3, t7 + addu.ph t4, t4, t7 + ulw t1, 0(t0) + ulw t2, 4(t0) + addu.ph t5, t5, t7 + addu.ph t6, t6, t7 + usw t3, 80(a2) + usw t4, 84(a2) + preceu.ph.qbr t3, t1 + preceu.ph.qbl t4, t1 + usw t5, 88(a2) + usw t6, 92(a2) + + lw t0, 28(a0) + preceu.ph.qbr t5, t2 + preceu.ph.qbl t6, t2 + addu t0, t0, a1 + addu.ph t3, t3, t7 + addu.ph t4, t4, t7 + ulw t1, 0(t0) + ulw t2, 4(t0) + addu.ph t5, t5, t7 + addu.ph t6, t6, t7 + usw t3, 96(a2) + usw t4, 100(a2) + preceu.ph.qbr t3, t1 + preceu.ph.qbl t4, t1 + usw t5, 104(a2) + usw t6, 108(a2) + preceu.ph.qbr t5, t2 + preceu.ph.qbl t6, t2 + addu.ph t3, t3, t7 + addu.ph t4, t4, t7 + addu.ph t5, t5, t7 + addu.ph t6, t6, t7 + usw t3, 112(a2) + usw t4, 116(a2) + usw t5, 120(a2) + usw t6, 124(a2) + + j ra + nop + +END(jsimd_convsamp_mips_dspr2) + +/*****************************************************************************/ +LEAF_MIPS_DSPR2(jsimd_convsamp_float_mips_dspr2) +/* + * a0 - sample_data + * a1 - start_col + * a2 - workspace + */ + + .set at + + lw t0, 0(a0) + addu t0, t0, a1 + lbu t1, 0(t0) + lbu t2, 1(t0) + lbu t3, 2(t0) + lbu t4, 3(t0) + lbu t5, 4(t0) + lbu t6, 5(t0) + lbu t7, 6(t0) + lbu t8, 7(t0) + addiu t1, t1, -128 + addiu t2, t2, -128 + addiu t3, t3, -128 + addiu t4, t4, -128 + addiu t5, t5, -128 + addiu t6, t6, -128 + addiu t7, t7, -128 + addiu t8, t8, -128 + mtc1 t1, f2 + mtc1 t2, f4 + mtc1 t3, f6 + mtc1 t4, f8 + mtc1 t5, f10 + mtc1 t6, f12 + mtc1 t7, f14 + mtc1 t8, f16 + cvt.s.w f2, f2 + cvt.s.w f4, f4 + cvt.s.w f6, f6 + cvt.s.w f8, f8 + cvt.s.w f10, f10 + cvt.s.w f12, f12 + cvt.s.w f14, f14 + cvt.s.w f16, f16 + lw t0, 4(a0) + swc1 f2, 0(a2) + swc1 f4, 4(a2) + swc1 f6, 8(a2) + addu t0, t0, a1 + swc1 f8, 12(a2) + swc1 f10, 16(a2) + swc1 f12, 20(a2) + swc1 f14, 24(a2) + swc1 f16, 28(a2) + //elemr 1 + lbu t1, 0(t0) + lbu t2, 1(t0) + lbu t3, 2(t0) + lbu t4, 3(t0) + lbu t5, 4(t0) + lbu t6, 5(t0) + lbu t7, 6(t0) + lbu t8, 7(t0) + addiu t1, t1, -128 + addiu t2, t2, -128 + addiu t3, t3, -128 + addiu t4, t4, -128 + addiu t5, t5, -128 + addiu t6, t6, -128 + addiu t7, t7, -128 + addiu t8, t8, -128 + mtc1 t1, f2 + mtc1 t2, f4 + mtc1 t3, f6 + mtc1 t4, f8 + mtc1 t5, f10 + mtc1 t6, f12 + mtc1 t7, f14 + mtc1 t8, f16 + cvt.s.w f2, f2 + cvt.s.w f4, f4 + cvt.s.w f6, f6 + cvt.s.w f8, f8 + cvt.s.w f10, f10 + cvt.s.w f12, f12 + cvt.s.w f14, f14 + cvt.s.w f16, f16 + lw t0, 8(a0) + swc1 f2, 32(a2) + swc1 f4, 36(a2) + swc1 f6, 40(a2) + addu t0, t0, a1 + swc1 f8, 44(a2) + swc1 f10, 48(a2) + swc1 f12, 52(a2) + swc1 f14, 56(a2) + swc1 f16, 60(a2) + //elemr 2 + lbu t1, 0(t0) + lbu t2, 1(t0) + lbu t3, 2(t0) + lbu t4, 3(t0) + lbu t5, 4(t0) + lbu t6, 5(t0) + lbu t7, 6(t0) + lbu t8, 7(t0) + addiu t1, t1, -128 + addiu t2, t2, -128 + addiu t3, t3, -128 + addiu t4, t4, -128 + addiu t5, t5, -128 + addiu t6, t6, -128 + addiu t7, t7, -128 + addiu t8, t8, -128 + mtc1 t1, f2 + mtc1 t2, f4 + mtc1 t3, f6 + mtc1 t4, f8 + mtc1 t5, f10 + mtc1 t6, f12 + mtc1 t7, f14 + mtc1 t8, f16 + cvt.s.w f2, f2 + cvt.s.w f4, f4 + cvt.s.w f6, f6 + cvt.s.w f8, f8 + cvt.s.w f10, f10 + cvt.s.w f12, f12 + cvt.s.w f14, f14 + cvt.s.w f16, f16 + lw t0, 12(a0) + swc1 f2, 64(a2) + swc1 f4, 68(a2) + swc1 f6, 72(a2) + addu t0, t0, a1 + swc1 f8, 76(a2) + swc1 f10, 80(a2) + swc1 f12, 84(a2) + swc1 f14, 88(a2) + swc1 f16, 92(a2) + //elemr 3 + lbu t1, 0(t0) + lbu t2, 1(t0) + lbu t3, 2(t0) + lbu t4, 3(t0) + lbu t5, 4(t0) + lbu t6, 5(t0) + lbu t7, 6(t0) + lbu t8, 7(t0) + addiu t1, t1, -128 + addiu t2, t2, -128 + addiu t3, t3, -128 + addiu t4, t4, -128 + addiu t5, t5, -128 + addiu t6, t6, -128 + addiu t7, t7, -128 + addiu t8, t8, -128 + mtc1 t1, f2 + mtc1 t2, f4 + mtc1 t3, f6 + mtc1 t4, f8 + mtc1 t5, f10 + mtc1 t6, f12 + mtc1 t7, f14 + mtc1 t8, f16 + cvt.s.w f2, f2 + cvt.s.w f4, f4 + cvt.s.w f6, f6 + cvt.s.w f8, f8 + cvt.s.w f10, f10 + cvt.s.w f12, f12 + cvt.s.w f14, f14 + cvt.s.w f16, f16 + lw t0, 16(a0) + swc1 f2, 96(a2) + swc1 f4, 100(a2) + swc1 f6, 104(a2) + addu t0, t0, a1 + swc1 f8, 108(a2) + swc1 f10, 112(a2) + swc1 f12, 116(a2) + swc1 f14, 120(a2) + swc1 f16, 124(a2) + //elemr 4 + lbu t1, 0(t0) + lbu t2, 1(t0) + lbu t3, 2(t0) + lbu t4, 3(t0) + lbu t5, 4(t0) + lbu t6, 5(t0) + lbu t7, 6(t0) + lbu t8, 7(t0) + addiu t1, t1, -128 + addiu t2, t2, -128 + addiu t3, t3, -128 + addiu t4, t4, -128 + addiu t5, t5, -128 + addiu t6, t6, -128 + addiu t7, t7, -128 + addiu t8, t8, -128 + mtc1 t1, f2 + mtc1 t2, f4 + mtc1 t3, f6 + mtc1 t4, f8 + mtc1 t5, f10 + mtc1 t6, f12 + mtc1 t7, f14 + mtc1 t8, f16 + cvt.s.w f2, f2 + cvt.s.w f4, f4 + cvt.s.w f6, f6 + cvt.s.w f8, f8 + cvt.s.w f10, f10 + cvt.s.w f12, f12 + cvt.s.w f14, f14 + cvt.s.w f16, f16 + lw t0, 20(a0) + swc1 f2, 128(a2) + swc1 f4, 132(a2) + swc1 f6, 136(a2) + addu t0, t0, a1 + swc1 f8, 140(a2) + swc1 f10, 144(a2) + swc1 f12, 148(a2) + swc1 f14, 152(a2) + swc1 f16, 156(a2) + //elemr 5 + lbu t1, 0(t0) + lbu t2, 1(t0) + lbu t3, 2(t0) + lbu t4, 3(t0) + lbu t5, 4(t0) + lbu t6, 5(t0) + lbu t7, 6(t0) + lbu t8, 7(t0) + addiu t1, t1, -128 + addiu t2, t2, -128 + addiu t3, t3, -128 + addiu t4, t4, -128 + addiu t5, t5, -128 + addiu t6, t6, -128 + addiu t7, t7, -128 + addiu t8, t8, -128 + mtc1 t1, f2 + mtc1 t2, f4 + mtc1 t3, f6 + mtc1 t4, f8 + mtc1 t5, f10 + mtc1 t6, f12 + mtc1 t7, f14 + mtc1 t8, f16 + cvt.s.w f2, f2 + cvt.s.w f4, f4 + cvt.s.w f6, f6 + cvt.s.w f8, f8 + cvt.s.w f10, f10 + cvt.s.w f12, f12 + cvt.s.w f14, f14 + cvt.s.w f16, f16 + lw t0, 24(a0) + swc1 f2, 160(a2) + swc1 f4, 164(a2) + swc1 f6, 168(a2) + addu t0, t0, a1 + swc1 f8, 172(a2) + swc1 f10, 176(a2) + swc1 f12, 180(a2) + swc1 f14, 184(a2) + swc1 f16, 188(a2) + //elemr 6 + lbu t1, 0(t0) + lbu t2, 1(t0) + lbu t3, 2(t0) + lbu t4, 3(t0) + lbu t5, 4(t0) + lbu t6, 5(t0) + lbu t7, 6(t0) + lbu t8, 7(t0) + addiu t1, t1, -128 + addiu t2, t2, -128 + addiu t3, t3, -128 + addiu t4, t4, -128 + addiu t5, t5, -128 + addiu t6, t6, -128 + addiu t7, t7, -128 + addiu t8, t8, -128 + mtc1 t1, f2 + mtc1 t2, f4 + mtc1 t3, f6 + mtc1 t4, f8 + mtc1 t5, f10 + mtc1 t6, f12 + mtc1 t7, f14 + mtc1 t8, f16 + cvt.s.w f2, f2 + cvt.s.w f4, f4 + cvt.s.w f6, f6 + cvt.s.w f8, f8 + cvt.s.w f10, f10 + cvt.s.w f12, f12 + cvt.s.w f14, f14 + cvt.s.w f16, f16 + lw t0, 28(a0) + swc1 f2, 192(a2) + swc1 f4, 196(a2) + swc1 f6, 200(a2) + addu t0, t0, a1 + swc1 f8, 204(a2) + swc1 f10, 208(a2) + swc1 f12, 212(a2) + swc1 f14, 216(a2) + swc1 f16, 220(a2) + //elemr 7 + lbu t1, 0(t0) + lbu t2, 1(t0) + lbu t3, 2(t0) + lbu t4, 3(t0) + lbu t5, 4(t0) + lbu t6, 5(t0) + lbu t7, 6(t0) + lbu t8, 7(t0) + addiu t1, t1, -128 + addiu t2, t2, -128 + addiu t3, t3, -128 + addiu t4, t4, -128 + addiu t5, t5, -128 + addiu t6, t6, -128 + addiu t7, t7, -128 + addiu t8, t8, -128 + mtc1 t1, f2 + mtc1 t2, f4 + mtc1 t3, f6 + mtc1 t4, f8 + mtc1 t5, f10 + mtc1 t6, f12 + mtc1 t7, f14 + mtc1 t8, f16 + cvt.s.w f2, f2 + cvt.s.w f4, f4 + cvt.s.w f6, f6 + cvt.s.w f8, f8 + cvt.s.w f10, f10 + cvt.s.w f12, f12 + cvt.s.w f14, f14 + cvt.s.w f16, f16 + swc1 f2, 224(a2) + swc1 f4, 228(a2) + swc1 f6, 232(a2) + swc1 f8, 236(a2) + swc1 f10, 240(a2) + swc1 f12, 244(a2) + swc1 f14, 248(a2) + swc1 f16, 252(a2) + + j ra + nop + +END(jsimd_convsamp_float_mips_dspr2) + +/*****************************************************************************/ + diff --git a/media/libjpeg/simd/jsimd_mips_dspr2_asm.h b/media/libjpeg/simd/jsimd_mips_dspr2_asm.h new file mode 100644 index 000000000..64f988048 --- /dev/null +++ b/media/libjpeg/simd/jsimd_mips_dspr2_asm.h @@ -0,0 +1,285 @@ +/* + * MIPS DSPr2 optimizations for libjpeg-turbo + * + * Copyright (C) 2013, MIPS Technologies, Inc., California. + * All Rights Reserved. + * Authors: Teodora Novkovic (teodora.novkovic@imgtec.com) + * Darko Laus (darko.laus@imgtec.com) + * This software is provided 'as-is', without any express or implied + * warranty. In no event will the authors be held liable for any damages + * arising from the use of this software. + * + * Permission is granted to anyone to use this software for any purpose, + * including commercial applications, and to alter it and redistribute it + * freely, subject to the following restrictions: + * + * 1. The origin of this software must not be misrepresented; you must not + * claim that you wrote the original software. If you use this software + * in a product, an acknowledgment in the product documentation would be + * appreciated but is not required. + * 2. Altered source versions must be plainly marked as such, and must not be + * misrepresented as being the original software. + * 3. This notice may not be removed or altered from any source distribution. + */ + +#define zero $0 +#define AT $1 +#define v0 $2 +#define v1 $3 +#define a0 $4 +#define a1 $5 +#define a2 $6 +#define a3 $7 +#define t0 $8 +#define t1 $9 +#define t2 $10 +#define t3 $11 +#define t4 $12 +#define t5 $13 +#define t6 $14 +#define t7 $15 +#define s0 $16 +#define s1 $17 +#define s2 $18 +#define s3 $19 +#define s4 $20 +#define s5 $21 +#define s6 $22 +#define s7 $23 +#define t8 $24 +#define t9 $25 +#define k0 $26 +#define k1 $27 +#define gp $28 +#define sp $29 +#define fp $30 +#define s8 $30 +#define ra $31 + +#define f0 $f0 +#define f1 $f1 +#define f2 $f2 +#define f3 $f3 +#define f4 $f4 +#define f5 $f5 +#define f6 $f6 +#define f7 $f7 +#define f8 $f8 +#define f9 $f9 +#define f10 $f10 +#define f11 $f11 +#define f12 $f12 +#define f13 $f13 +#define f14 $f14 +#define f15 $f15 +#define f16 $f16 +#define f17 $f17 +#define f18 $f18 +#define f19 $f19 +#define f20 $f20 +#define f21 $f21 +#define f22 $f22 +#define f23 $f23 +#define f24 $f24 +#define f25 $f25 +#define f26 $f26 +#define f27 $f27 +#define f28 $f28 +#define f29 $f29 +#define f30 $f30 +#define f31 $f31 + +/* + * LEAF_MIPS32R2 - declare leaf routine for MIPS32r2 + */ +#define LEAF_MIPS32R2(symbol) \ + .globl symbol; \ + .align 2; \ + .type symbol, @function; \ + .ent symbol, 0; \ +symbol: .frame sp, 0, ra; \ + .set push; \ + .set arch=mips32r2; \ + .set noreorder; \ + .set noat; + +/* + * LEAF_MIPS_DSPR2 - declare leaf routine for MIPS DSPr2 + */ +#define LEAF_MIPS_DSPR2(symbol) \ +LEAF_MIPS32R2(symbol) \ + .set dspr2; + +/* + * END - mark end of function + */ +#define END(function) \ + .set pop; \ + .end function; \ + .size function,.-function + +/* + * Checks if stack offset is big enough for storing/restoring regs_num + * number of register to/from stack. Stack offset must be greater than + * or equal to the number of bytes needed for storing registers (regs_num*4). + * Since MIPS ABI allows usage of first 16 bytes of stack frame (this is + * preserved for input arguments of the functions, already stored in a0-a3), + * stack size can be further optimized by utilizing this space. + */ +.macro CHECK_STACK_OFFSET regs_num, stack_offset +.if \stack_offset < \regs_num * 4 - 16 +.error "Stack offset too small." +.endif +.endm + +/* + * Saves set of registers on stack. Maximum number of registers that + * can be saved on stack is limitted to 14 (a0-a3, v0-v1 and s0-s7). + * Stack offset is number of bytes that are added to stack pointer (sp) + * before registers are pushed in order to provide enough space on stack + * (offset must be multiple of 4, and must be big enough, as described by + * CHECK_STACK_OFFSET macro). This macro is intended to be used in + * combination with RESTORE_REGS_FROM_STACK macro. Example: + * SAVE_REGS_ON_STACK 4, v0, v1, s0, s1 + * RESTORE_REGS_FROM_STACK 4, v0, v1, s0, s1 + */ +.macro SAVE_REGS_ON_STACK stack_offset = 0, r1, \ + r2 = 0, r3 = 0, r4 = 0, \ + r5 = 0, r6 = 0, r7 = 0, \ + r8 = 0, r9 = 0, r10 = 0, \ + r11 = 0, r12 = 0, r13 = 0, \ + r14 = 0 + .if (\stack_offset < 0) || (\stack_offset - (\stack_offset / 4) * 4) + .error "Stack offset must be pozitive and multiple of 4." + .endif + .if \stack_offset != 0 + addiu sp, sp, -\stack_offset + .endif + sw \r1, 0(sp) + .if \r2 != 0 + sw \r2, 4(sp) + .endif + .if \r3 != 0 + sw \r3, 8(sp) + .endif + .if \r4 != 0 + sw \r4, 12(sp) + .endif + .if \r5 != 0 + CHECK_STACK_OFFSET 5, \stack_offset + sw \r5, 16(sp) + .endif + .if \r6 != 0 + CHECK_STACK_OFFSET 6, \stack_offset + sw \r6, 20(sp) + .endif + .if \r7 != 0 + CHECK_STACK_OFFSET 7, \stack_offset + sw \r7, 24(sp) + .endif + .if \r8 != 0 + CHECK_STACK_OFFSET 8, \stack_offset + sw \r8, 28(sp) + .endif + .if \r9 != 0 + CHECK_STACK_OFFSET 9, \stack_offset + sw \r9, 32(sp) + .endif + .if \r10 != 0 + CHECK_STACK_OFFSET 10, \stack_offset + sw \r10, 36(sp) + .endif + .if \r11 != 0 + CHECK_STACK_OFFSET 11, \stack_offset + sw \r11, 40(sp) + .endif + .if \r12 != 0 + CHECK_STACK_OFFSET 12, \stack_offset + sw \r12, 44(sp) + .endif + .if \r13 != 0 + CHECK_STACK_OFFSET 13, \stack_offset + sw \r13, 48(sp) + .endif + .if \r14 != 0 + CHECK_STACK_OFFSET 14, \stack_offset + sw \r14, 52(sp) + .endif +.endm + +/* + * Restores set of registers from stack. Maximum number of registers that + * can be restored from stack is limitted to 14 (a0-a3, v0-v1 and s0-s7). + * Stack offset is number of bytes that are added to stack pointer (sp) + * after registers are restored (offset must be multiple of 4, and must + * be big enough, as described by CHECK_STACK_OFFSET macro). This macro is + * intended to be used in combination with RESTORE_REGS_FROM_STACK macro. + * Example: + * SAVE_REGS_ON_STACK 4, v0, v1, s0, s1 + * RESTORE_REGS_FROM_STACK 4, v0, v1, s0, s1 + */ +.macro RESTORE_REGS_FROM_STACK stack_offset = 0, r1, \ + r2 = 0, r3 = 0, r4 = 0, \ + r5 = 0, r6 = 0, r7 = 0, \ + r8 = 0, r9 = 0, r10 = 0, \ + r11 = 0, r12 = 0, r13 = 0, \ + r14 = 0 + .if (\stack_offset < 0) || (\stack_offset - (\stack_offset/4)*4) + .error "Stack offset must be pozitive and multiple of 4." + .endif + lw \r1, 0(sp) + .if \r2 != 0 + lw \r2, 4(sp) + .endif + .if \r3 != 0 + lw \r3, 8(sp) + .endif + .if \r4 != 0 + lw \r4, 12(sp) + .endif + .if \r5 != 0 + CHECK_STACK_OFFSET 5, \stack_offset + lw \r5, 16(sp) + .endif + .if \r6 != 0 + CHECK_STACK_OFFSET 6, \stack_offset + lw \r6, 20(sp) + .endif + .if \r7 != 0 + CHECK_STACK_OFFSET 7, \stack_offset + lw \r7, 24(sp) + .endif + .if \r8 != 0 + CHECK_STACK_OFFSET 8, \stack_offset + lw \r8, 28(sp) + .endif + .if \r9 != 0 + CHECK_STACK_OFFSET 9, \stack_offset + lw \r9, 32(sp) + .endif + .if \r10 != 0 + CHECK_STACK_OFFSET 10, \stack_offset + lw \r10, 36(sp) + .endif + .if \r11 != 0 + CHECK_STACK_OFFSET 11, \stack_offset + lw \r11, 40(sp) + .endif + .if \r12 != 0 + CHECK_STACK_OFFSET 12, \stack_offset + lw \r12, 44(sp) + .endif + .if \r13 != 0 + CHECK_STACK_OFFSET 13, \stack_offset + lw \r13, 48(sp) + .endif + .if \r14 != 0 + CHECK_STACK_OFFSET 14, \stack_offset + lw \r14, 52(sp) + .endif + .if \stack_offset != 0 + addiu sp, sp, \stack_offset + .endif +.endm + + diff --git a/media/libjpeg/simd/jsimd_powerpc.c b/media/libjpeg/simd/jsimd_powerpc.c new file mode 100644 index 000000000..42dc1e086 --- /dev/null +++ b/media/libjpeg/simd/jsimd_powerpc.c @@ -0,0 +1,828 @@ +/* + * jsimd_powerpc.c + * + * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB + * Copyright (C) 2009-2011, 2014-2016, D. R. Commander. + * Copyright (C) 2015, Matthieu Darbois. + * + * Based on the x86 SIMD extension for IJG JPEG library, + * Copyright (C) 1999-2006, MIYASAKA Masaru. + * For conditions of distribution and use, see copyright notice in jsimdext.inc + * + * This file contains the interface between the "normal" portions + * of the library and the SIMD implementations when running on a + * PowerPC architecture. + */ + +#define JPEG_INTERNALS +#include "../jinclude.h" +#include "../jpeglib.h" +#include "../jsimd.h" +#include "../jdct.h" +#include "../jsimddct.h" +#include "jsimd.h" + +#include <stdio.h> +#include <string.h> +#include <ctype.h> + +static unsigned int simd_support = ~0; + +#if defined(__linux__) || defined(ANDROID) || defined(__ANDROID__) + +#define SOMEWHAT_SANE_PROC_CPUINFO_SIZE_LIMIT (1024 * 1024) + +LOCAL(int) +check_feature (char *buffer, char *feature) +{ + char *p; + if (*feature == 0) + return 0; + if (strncmp(buffer, "cpu", 3) != 0) + return 0; + buffer += 3; + while (isspace(*buffer)) + buffer++; + + /* Check if 'feature' is present in the buffer as a separate word */ + while ((p = strstr(buffer, feature))) { + if (p > buffer && !isspace(*(p - 1))) { + buffer++; + continue; + } + p += strlen(feature); + if (*p != 0 && !isspace(*p)) { + buffer++; + continue; + } + return 1; + } + return 0; +} + +LOCAL(int) +parse_proc_cpuinfo (int bufsize) +{ + char *buffer = (char *)malloc(bufsize); + FILE *fd; + simd_support = 0; + + if (!buffer) + return 0; + + fd = fopen("/proc/cpuinfo", "r"); + if (fd) { + while (fgets(buffer, bufsize, fd)) { + if (!strchr(buffer, '\n') && !feof(fd)) { + /* "impossible" happened - insufficient size of the buffer! */ + fclose(fd); + free(buffer); + return 0; + } + if (check_feature(buffer, "altivec")) + simd_support |= JSIMD_ALTIVEC; + } + fclose(fd); + } + free(buffer); + return 1; +} + +#endif + +/* + * Check what SIMD accelerations are supported. + * + * FIXME: This code is racy under a multi-threaded environment. + */ +LOCAL(void) +init_simd (void) +{ + char *env = NULL; +#if !defined(__ALTIVEC__) && (defined(__linux__) || defined(ANDROID) || defined(__ANDROID__)) + int bufsize = 1024; /* an initial guess for the line buffer size limit */ +#endif + + if (simd_support != ~0U) + return; + + simd_support = 0; + +#if defined(__ALTIVEC__) || defined(__APPLE__) + simd_support |= JSIMD_ALTIVEC; +#elif defined(__linux__) || defined(ANDROID) || defined(__ANDROID__) + while (!parse_proc_cpuinfo(bufsize)) { + bufsize *= 2; + if (bufsize > SOMEWHAT_SANE_PROC_CPUINFO_SIZE_LIMIT) + break; + } +#endif + + /* Force different settings through environment variables */ + env = getenv("JSIMD_FORCEALTIVEC"); + if ((env != NULL) && (strcmp(env, "1") == 0)) + simd_support = JSIMD_ALTIVEC; + env = getenv("JSIMD_FORCENONE"); + if ((env != NULL) && (strcmp(env, "1") == 0)) + simd_support = 0; +} + +GLOBAL(int) +jsimd_can_rgb_ycc (void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4)) + return 0; + + if (simd_support & JSIMD_ALTIVEC) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_rgb_gray (void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4)) + return 0; + + if (simd_support & JSIMD_ALTIVEC) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_ycc_rgb (void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4)) + return 0; + + if (simd_support & JSIMD_ALTIVEC) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_ycc_rgb565 (void) +{ + return 0; +} + +GLOBAL(void) +jsimd_rgb_ycc_convert (j_compress_ptr cinfo, + JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows) +{ + void (*altivecfct)(JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int); + + switch(cinfo->in_color_space) { + case JCS_EXT_RGB: + altivecfct=jsimd_extrgb_ycc_convert_altivec; + break; + case JCS_EXT_RGBX: + case JCS_EXT_RGBA: + altivecfct=jsimd_extrgbx_ycc_convert_altivec; + break; + case JCS_EXT_BGR: + altivecfct=jsimd_extbgr_ycc_convert_altivec; + break; + case JCS_EXT_BGRX: + case JCS_EXT_BGRA: + altivecfct=jsimd_extbgrx_ycc_convert_altivec; + break; + case JCS_EXT_XBGR: + case JCS_EXT_ABGR: + altivecfct=jsimd_extxbgr_ycc_convert_altivec; + break; + case JCS_EXT_XRGB: + case JCS_EXT_ARGB: + altivecfct=jsimd_extxrgb_ycc_convert_altivec; + break; + default: + altivecfct=jsimd_rgb_ycc_convert_altivec; + break; + } + + altivecfct(cinfo->image_width, input_buf, output_buf, output_row, num_rows); +} + +GLOBAL(void) +jsimd_rgb_gray_convert (j_compress_ptr cinfo, + JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows) +{ + void (*altivecfct)(JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int); + + switch(cinfo->in_color_space) { + case JCS_EXT_RGB: + altivecfct=jsimd_extrgb_gray_convert_altivec; + break; + case JCS_EXT_RGBX: + case JCS_EXT_RGBA: + altivecfct=jsimd_extrgbx_gray_convert_altivec; + break; + case JCS_EXT_BGR: + altivecfct=jsimd_extbgr_gray_convert_altivec; + break; + case JCS_EXT_BGRX: + case JCS_EXT_BGRA: + altivecfct=jsimd_extbgrx_gray_convert_altivec; + break; + case JCS_EXT_XBGR: + case JCS_EXT_ABGR: + altivecfct=jsimd_extxbgr_gray_convert_altivec; + break; + case JCS_EXT_XRGB: + case JCS_EXT_ARGB: + altivecfct=jsimd_extxrgb_gray_convert_altivec; + break; + default: + altivecfct=jsimd_rgb_gray_convert_altivec; + break; + } + + altivecfct(cinfo->image_width, input_buf, output_buf, output_row, num_rows); +} + +GLOBAL(void) +jsimd_ycc_rgb_convert (j_decompress_ptr cinfo, + JSAMPIMAGE input_buf, JDIMENSION input_row, + JSAMPARRAY output_buf, int num_rows) +{ + void (*altivecfct)(JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY, int); + + switch(cinfo->out_color_space) { + case JCS_EXT_RGB: + altivecfct=jsimd_ycc_extrgb_convert_altivec; + break; + case JCS_EXT_RGBX: + case JCS_EXT_RGBA: + altivecfct=jsimd_ycc_extrgbx_convert_altivec; + break; + case JCS_EXT_BGR: + altivecfct=jsimd_ycc_extbgr_convert_altivec; + break; + case JCS_EXT_BGRX: + case JCS_EXT_BGRA: + altivecfct=jsimd_ycc_extbgrx_convert_altivec; + break; + case JCS_EXT_XBGR: + case JCS_EXT_ABGR: + altivecfct=jsimd_ycc_extxbgr_convert_altivec; + break; + case JCS_EXT_XRGB: + case JCS_EXT_ARGB: + altivecfct=jsimd_ycc_extxrgb_convert_altivec; + break; + default: + altivecfct=jsimd_ycc_rgb_convert_altivec; + break; + } + + altivecfct(cinfo->output_width, input_buf, input_row, output_buf, num_rows); +} + +GLOBAL(void) +jsimd_ycc_rgb565_convert (j_decompress_ptr cinfo, + JSAMPIMAGE input_buf, JDIMENSION input_row, + JSAMPARRAY output_buf, int num_rows) +{ +} + +GLOBAL(int) +jsimd_can_h2v2_downsample (void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + + if (simd_support & JSIMD_ALTIVEC) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_h2v1_downsample (void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + + if (simd_support & JSIMD_ALTIVEC) + return 1; + + return 0; +} + +GLOBAL(void) +jsimd_h2v2_downsample (j_compress_ptr cinfo, jpeg_component_info *compptr, + JSAMPARRAY input_data, JSAMPARRAY output_data) +{ + jsimd_h2v2_downsample_altivec(cinfo->image_width, cinfo->max_v_samp_factor, + compptr->v_samp_factor, + compptr->width_in_blocks, + input_data, output_data); +} + +GLOBAL(void) +jsimd_h2v1_downsample (j_compress_ptr cinfo, jpeg_component_info *compptr, + JSAMPARRAY input_data, JSAMPARRAY output_data) +{ + jsimd_h2v1_downsample_altivec(cinfo->image_width, cinfo->max_v_samp_factor, + compptr->v_samp_factor, + compptr->width_in_blocks, + input_data, output_data); +} + +GLOBAL(int) +jsimd_can_h2v2_upsample (void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + + if (simd_support & JSIMD_ALTIVEC) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_h2v1_upsample (void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + + if (simd_support & JSIMD_ALTIVEC) + return 1; + + return 0; +} + +GLOBAL(void) +jsimd_h2v2_upsample (j_decompress_ptr cinfo, + jpeg_component_info *compptr, + JSAMPARRAY input_data, + JSAMPARRAY *output_data_ptr) +{ + jsimd_h2v2_upsample_altivec(cinfo->max_v_samp_factor, cinfo->output_width, + input_data, output_data_ptr); +} + +GLOBAL(void) +jsimd_h2v1_upsample (j_decompress_ptr cinfo, + jpeg_component_info *compptr, + JSAMPARRAY input_data, + JSAMPARRAY *output_data_ptr) +{ + jsimd_h2v1_upsample_altivec(cinfo->max_v_samp_factor, cinfo->output_width, + input_data, output_data_ptr); +} + +GLOBAL(int) +jsimd_can_h2v2_fancy_upsample (void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + + if (simd_support & JSIMD_ALTIVEC) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_h2v1_fancy_upsample (void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + + if (simd_support & JSIMD_ALTIVEC) + return 1; + + return 0; +} + +GLOBAL(void) +jsimd_h2v2_fancy_upsample (j_decompress_ptr cinfo, + jpeg_component_info *compptr, + JSAMPARRAY input_data, + JSAMPARRAY *output_data_ptr) +{ + jsimd_h2v2_fancy_upsample_altivec(cinfo->max_v_samp_factor, + compptr->downsampled_width, input_data, + output_data_ptr); +} + +GLOBAL(void) +jsimd_h2v1_fancy_upsample (j_decompress_ptr cinfo, + jpeg_component_info *compptr, + JSAMPARRAY input_data, + JSAMPARRAY *output_data_ptr) +{ + jsimd_h2v1_fancy_upsample_altivec(cinfo->max_v_samp_factor, + compptr->downsampled_width, input_data, + output_data_ptr); +} + +GLOBAL(int) +jsimd_can_h2v2_merged_upsample (void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + + if (simd_support & JSIMD_ALTIVEC) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_h2v1_merged_upsample (void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + + if (simd_support & JSIMD_ALTIVEC) + return 1; + + return 0; +} + +GLOBAL(void) +jsimd_h2v2_merged_upsample (j_decompress_ptr cinfo, + JSAMPIMAGE input_buf, + JDIMENSION in_row_group_ctr, + JSAMPARRAY output_buf) +{ + void (*altivecfct)(JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY); + + switch(cinfo->out_color_space) { + case JCS_EXT_RGB: + altivecfct=jsimd_h2v2_extrgb_merged_upsample_altivec; + break; + case JCS_EXT_RGBX: + case JCS_EXT_RGBA: + altivecfct=jsimd_h2v2_extrgbx_merged_upsample_altivec; + break; + case JCS_EXT_BGR: + altivecfct=jsimd_h2v2_extbgr_merged_upsample_altivec; + break; + case JCS_EXT_BGRX: + case JCS_EXT_BGRA: + altivecfct=jsimd_h2v2_extbgrx_merged_upsample_altivec; + break; + case JCS_EXT_XBGR: + case JCS_EXT_ABGR: + altivecfct=jsimd_h2v2_extxbgr_merged_upsample_altivec; + break; + case JCS_EXT_XRGB: + case JCS_EXT_ARGB: + altivecfct=jsimd_h2v2_extxrgb_merged_upsample_altivec; + break; + default: + altivecfct=jsimd_h2v2_merged_upsample_altivec; + break; + } + + altivecfct(cinfo->output_width, input_buf, in_row_group_ctr, output_buf); +} + +GLOBAL(void) +jsimd_h2v1_merged_upsample (j_decompress_ptr cinfo, + JSAMPIMAGE input_buf, + JDIMENSION in_row_group_ctr, + JSAMPARRAY output_buf) +{ + void (*altivecfct)(JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY); + + switch(cinfo->out_color_space) { + case JCS_EXT_RGB: + altivecfct=jsimd_h2v1_extrgb_merged_upsample_altivec; + break; + case JCS_EXT_RGBX: + case JCS_EXT_RGBA: + altivecfct=jsimd_h2v1_extrgbx_merged_upsample_altivec; + break; + case JCS_EXT_BGR: + altivecfct=jsimd_h2v1_extbgr_merged_upsample_altivec; + break; + case JCS_EXT_BGRX: + case JCS_EXT_BGRA: + altivecfct=jsimd_h2v1_extbgrx_merged_upsample_altivec; + break; + case JCS_EXT_XBGR: + case JCS_EXT_ABGR: + altivecfct=jsimd_h2v1_extxbgr_merged_upsample_altivec; + break; + case JCS_EXT_XRGB: + case JCS_EXT_ARGB: + altivecfct=jsimd_h2v1_extxrgb_merged_upsample_altivec; + break; + default: + altivecfct=jsimd_h2v1_merged_upsample_altivec; + break; + } + + altivecfct(cinfo->output_width, input_buf, in_row_group_ctr, output_buf); +} + +GLOBAL(int) +jsimd_can_convsamp (void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (DCTSIZE != 8) + return 0; + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + if (sizeof(DCTELEM) != 2) + return 0; + + if (simd_support & JSIMD_ALTIVEC) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_convsamp_float (void) +{ + return 0; +} + +GLOBAL(void) +jsimd_convsamp (JSAMPARRAY sample_data, JDIMENSION start_col, + DCTELEM *workspace) +{ + jsimd_convsamp_altivec(sample_data, start_col, workspace); +} + +GLOBAL(void) +jsimd_convsamp_float (JSAMPARRAY sample_data, JDIMENSION start_col, + FAST_FLOAT *workspace) +{ +} + +GLOBAL(int) +jsimd_can_fdct_islow (void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (DCTSIZE != 8) + return 0; + if (sizeof(DCTELEM) != 2) + return 0; + + if (simd_support & JSIMD_ALTIVEC) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_fdct_ifast (void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (DCTSIZE != 8) + return 0; + if (sizeof(DCTELEM) != 2) + return 0; + + if (simd_support & JSIMD_ALTIVEC) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_fdct_float (void) +{ + return 0; +} + +GLOBAL(void) +jsimd_fdct_islow (DCTELEM *data) +{ + jsimd_fdct_islow_altivec(data); +} + +GLOBAL(void) +jsimd_fdct_ifast (DCTELEM *data) +{ + jsimd_fdct_ifast_altivec(data); +} + +GLOBAL(void) +jsimd_fdct_float (FAST_FLOAT *data) +{ +} + +GLOBAL(int) +jsimd_can_quantize (void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (DCTSIZE != 8) + return 0; + if (sizeof(JCOEF) != 2) + return 0; + if (sizeof(DCTELEM) != 2) + return 0; + + if (simd_support & JSIMD_ALTIVEC) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_quantize_float (void) +{ + return 0; +} + +GLOBAL(void) +jsimd_quantize (JCOEFPTR coef_block, DCTELEM *divisors, + DCTELEM *workspace) +{ + jsimd_quantize_altivec(coef_block, divisors, workspace); +} + +GLOBAL(void) +jsimd_quantize_float (JCOEFPTR coef_block, FAST_FLOAT *divisors, + FAST_FLOAT *workspace) +{ +} + +GLOBAL(int) +jsimd_can_idct_2x2 (void) +{ + return 0; +} + +GLOBAL(int) +jsimd_can_idct_4x4 (void) +{ + return 0; +} + +GLOBAL(void) +jsimd_idct_2x2 (j_decompress_ptr cinfo, jpeg_component_info *compptr, + JCOEFPTR coef_block, JSAMPARRAY output_buf, + JDIMENSION output_col) +{ +} + +GLOBAL(void) +jsimd_idct_4x4 (j_decompress_ptr cinfo, jpeg_component_info *compptr, + JCOEFPTR coef_block, JSAMPARRAY output_buf, + JDIMENSION output_col) +{ +} + +GLOBAL(int) +jsimd_can_idct_islow (void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (DCTSIZE != 8) + return 0; + if (sizeof(JCOEF) != 2) + return 0; + + if (simd_support & JSIMD_ALTIVEC) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_idct_ifast (void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (DCTSIZE != 8) + return 0; + if (sizeof(JCOEF) != 2) + return 0; + + if (simd_support & JSIMD_ALTIVEC) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_idct_float (void) +{ + return 0; +} + +GLOBAL(void) +jsimd_idct_islow (j_decompress_ptr cinfo, jpeg_component_info *compptr, + JCOEFPTR coef_block, JSAMPARRAY output_buf, + JDIMENSION output_col) +{ + jsimd_idct_islow_altivec(compptr->dct_table, coef_block, output_buf, + output_col); +} + +GLOBAL(void) +jsimd_idct_ifast (j_decompress_ptr cinfo, jpeg_component_info *compptr, + JCOEFPTR coef_block, JSAMPARRAY output_buf, + JDIMENSION output_col) +{ + jsimd_idct_ifast_altivec(compptr->dct_table, coef_block, output_buf, + output_col); +} + +GLOBAL(void) +jsimd_idct_float (j_decompress_ptr cinfo, jpeg_component_info *compptr, + JCOEFPTR coef_block, JSAMPARRAY output_buf, + JDIMENSION output_col) +{ +} + +GLOBAL(int) +jsimd_can_huff_encode_one_block (void) +{ + return 0; +} + +GLOBAL(JOCTET*) +jsimd_huff_encode_one_block (void *state, JOCTET *buffer, JCOEFPTR block, + int last_dc_val, c_derived_tbl *dctbl, + c_derived_tbl *actbl) +{ + return NULL; +} diff --git a/media/libjpeg/simd/jsimd_x86_64.c b/media/libjpeg/simd/jsimd_x86_64.c new file mode 100644 index 000000000..a62bcdb0e --- /dev/null +++ b/media/libjpeg/simd/jsimd_x86_64.c @@ -0,0 +1,887 @@ +/* + * jsimd_x86_64.c + * + * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB + * Copyright (C) 2009-2011, 2014, 2016, D. R. Commander. + * Copyright (C) 2015, Matthieu Darbois. + * + * Based on the x86 SIMD extension for IJG JPEG library, + * Copyright (C) 1999-2006, MIYASAKA Masaru. + * For conditions of distribution and use, see copyright notice in jsimdext.inc + * + * This file contains the interface between the "normal" portions + * of the library and the SIMD implementations when running on a + * 64-bit x86 architecture. + */ + +#define JPEG_INTERNALS +#include "../jinclude.h" +#include "../jpeglib.h" +#include "../jsimd.h" +#include "../jdct.h" +#include "../jsimddct.h" +#include "jsimd.h" + +/* + * In the PIC cases, we have no guarantee that constants will keep + * their alignment. This macro allows us to verify it at runtime. + */ +#define IS_ALIGNED(ptr, order) (((size_t)ptr & ((1 << order) - 1)) == 0) + +#define IS_ALIGNED_SSE(ptr) (IS_ALIGNED(ptr, 4)) /* 16 byte alignment */ + +static unsigned int simd_support = ~0; +static unsigned int simd_huffman = 1; + +/* + * Check what SIMD accelerations are supported. + * + * FIXME: This code is racy under a multi-threaded environment. + */ +LOCAL(void) +init_simd (void) +{ + char *env = NULL; + + if (simd_support != ~0U) + return; + + simd_support = JSIMD_SSE2 | JSIMD_SSE; + + /* Force different settings through environment variables */ + env = getenv("JSIMD_FORCENONE"); + if ((env != NULL) && (strcmp(env, "1") == 0)) + simd_support = 0; + env = getenv("JSIMD_NOHUFFENC"); + if ((env != NULL) && (strcmp(env, "1") == 0)) + simd_huffman = 0; +} + +GLOBAL(int) +jsimd_can_rgb_ycc (void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4)) + return 0; + + if ((simd_support & JSIMD_SSE2) && + IS_ALIGNED_SSE(jconst_rgb_ycc_convert_sse2)) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_rgb_gray (void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4)) + return 0; + + if ((simd_support & JSIMD_SSE2) && + IS_ALIGNED_SSE(jconst_rgb_gray_convert_sse2)) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_ycc_rgb (void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4)) + return 0; + + if ((simd_support & JSIMD_SSE2) && + IS_ALIGNED_SSE(jconst_ycc_rgb_convert_sse2)) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_ycc_rgb565 (void) +{ + return 0; +} + +GLOBAL(void) +jsimd_rgb_ycc_convert (j_compress_ptr cinfo, + JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows) +{ + void (*sse2fct)(JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int); + + switch(cinfo->in_color_space) { + case JCS_EXT_RGB: + sse2fct=jsimd_extrgb_ycc_convert_sse2; + break; + case JCS_EXT_RGBX: + case JCS_EXT_RGBA: + sse2fct=jsimd_extrgbx_ycc_convert_sse2; + break; + case JCS_EXT_BGR: + sse2fct=jsimd_extbgr_ycc_convert_sse2; + break; + case JCS_EXT_BGRX: + case JCS_EXT_BGRA: + sse2fct=jsimd_extbgrx_ycc_convert_sse2; + break; + case JCS_EXT_XBGR: + case JCS_EXT_ABGR: + sse2fct=jsimd_extxbgr_ycc_convert_sse2; + break; + case JCS_EXT_XRGB: + case JCS_EXT_ARGB: + sse2fct=jsimd_extxrgb_ycc_convert_sse2; + break; + default: + sse2fct=jsimd_rgb_ycc_convert_sse2; + break; + } + + sse2fct(cinfo->image_width, input_buf, output_buf, output_row, num_rows); +} + +GLOBAL(void) +jsimd_rgb_gray_convert (j_compress_ptr cinfo, + JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows) +{ + void (*sse2fct)(JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int); + + switch(cinfo->in_color_space) { + case JCS_EXT_RGB: + sse2fct=jsimd_extrgb_gray_convert_sse2; + break; + case JCS_EXT_RGBX: + case JCS_EXT_RGBA: + sse2fct=jsimd_extrgbx_gray_convert_sse2; + break; + case JCS_EXT_BGR: + sse2fct=jsimd_extbgr_gray_convert_sse2; + break; + case JCS_EXT_BGRX: + case JCS_EXT_BGRA: + sse2fct=jsimd_extbgrx_gray_convert_sse2; + break; + case JCS_EXT_XBGR: + case JCS_EXT_ABGR: + sse2fct=jsimd_extxbgr_gray_convert_sse2; + break; + case JCS_EXT_XRGB: + case JCS_EXT_ARGB: + sse2fct=jsimd_extxrgb_gray_convert_sse2; + break; + default: + sse2fct=jsimd_rgb_gray_convert_sse2; + break; + } + + sse2fct(cinfo->image_width, input_buf, output_buf, output_row, num_rows); +} + +GLOBAL(void) +jsimd_ycc_rgb_convert (j_decompress_ptr cinfo, + JSAMPIMAGE input_buf, JDIMENSION input_row, + JSAMPARRAY output_buf, int num_rows) +{ + void (*sse2fct)(JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY, int); + + switch(cinfo->out_color_space) { + case JCS_EXT_RGB: + sse2fct=jsimd_ycc_extrgb_convert_sse2; + break; + case JCS_EXT_RGBX: + case JCS_EXT_RGBA: + sse2fct=jsimd_ycc_extrgbx_convert_sse2; + break; + case JCS_EXT_BGR: + sse2fct=jsimd_ycc_extbgr_convert_sse2; + break; + case JCS_EXT_BGRX: + case JCS_EXT_BGRA: + sse2fct=jsimd_ycc_extbgrx_convert_sse2; + break; + case JCS_EXT_XBGR: + case JCS_EXT_ABGR: + sse2fct=jsimd_ycc_extxbgr_convert_sse2; + break; + case JCS_EXT_XRGB: + case JCS_EXT_ARGB: + sse2fct=jsimd_ycc_extxrgb_convert_sse2; + break; + default: + sse2fct=jsimd_ycc_rgb_convert_sse2; + break; + } + + sse2fct(cinfo->output_width, input_buf, input_row, output_buf, num_rows); +} + +GLOBAL(void) +jsimd_ycc_rgb565_convert (j_decompress_ptr cinfo, + JSAMPIMAGE input_buf, JDIMENSION input_row, + JSAMPARRAY output_buf, int num_rows) +{ +} + +GLOBAL(int) +jsimd_can_h2v2_downsample (void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + + if (simd_support & JSIMD_SSE2) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_h2v1_downsample (void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + + if (simd_support & JSIMD_SSE2) + return 1; + + return 0; +} + +GLOBAL(void) +jsimd_h2v2_downsample (j_compress_ptr cinfo, jpeg_component_info *compptr, + JSAMPARRAY input_data, JSAMPARRAY output_data) +{ + jsimd_h2v2_downsample_sse2(cinfo->image_width, cinfo->max_v_samp_factor, + compptr->v_samp_factor, compptr->width_in_blocks, + input_data, output_data); +} + +GLOBAL(void) +jsimd_h2v1_downsample (j_compress_ptr cinfo, jpeg_component_info *compptr, + JSAMPARRAY input_data, JSAMPARRAY output_data) +{ + jsimd_h2v1_downsample_sse2(cinfo->image_width, cinfo->max_v_samp_factor, + compptr->v_samp_factor, compptr->width_in_blocks, + input_data, output_data); +} + +GLOBAL(int) +jsimd_can_h2v2_upsample (void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + + if (simd_support & JSIMD_SSE2) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_h2v1_upsample (void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + + if (simd_support & JSIMD_SSE2) + return 1; + + return 0; +} + +GLOBAL(void) +jsimd_h2v2_upsample (j_decompress_ptr cinfo, + jpeg_component_info *compptr, + JSAMPARRAY input_data, + JSAMPARRAY *output_data_ptr) +{ + jsimd_h2v2_upsample_sse2(cinfo->max_v_samp_factor, cinfo->output_width, + input_data, output_data_ptr); +} + +GLOBAL(void) +jsimd_h2v1_upsample (j_decompress_ptr cinfo, + jpeg_component_info *compptr, + JSAMPARRAY input_data, + JSAMPARRAY *output_data_ptr) +{ + jsimd_h2v1_upsample_sse2(cinfo->max_v_samp_factor, cinfo->output_width, + input_data, output_data_ptr); +} + +GLOBAL(int) +jsimd_can_h2v2_fancy_upsample (void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + + if ((simd_support & JSIMD_SSE2) && + IS_ALIGNED_SSE(jconst_fancy_upsample_sse2)) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_h2v1_fancy_upsample (void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + + if ((simd_support & JSIMD_SSE2) && + IS_ALIGNED_SSE(jconst_fancy_upsample_sse2)) + return 1; + + return 0; +} + +GLOBAL(void) +jsimd_h2v2_fancy_upsample (j_decompress_ptr cinfo, + jpeg_component_info *compptr, + JSAMPARRAY input_data, + JSAMPARRAY *output_data_ptr) +{ + jsimd_h2v2_fancy_upsample_sse2(cinfo->max_v_samp_factor, + compptr->downsampled_width, input_data, + output_data_ptr); +} + +GLOBAL(void) +jsimd_h2v1_fancy_upsample (j_decompress_ptr cinfo, + jpeg_component_info *compptr, + JSAMPARRAY input_data, + JSAMPARRAY *output_data_ptr) +{ + jsimd_h2v1_fancy_upsample_sse2(cinfo->max_v_samp_factor, + compptr->downsampled_width, input_data, + output_data_ptr); +} + +GLOBAL(int) +jsimd_can_h2v2_merged_upsample (void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + + if ((simd_support & JSIMD_SSE2) && + IS_ALIGNED_SSE(jconst_merged_upsample_sse2)) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_h2v1_merged_upsample (void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + + if ((simd_support & JSIMD_SSE2) && + IS_ALIGNED_SSE(jconst_merged_upsample_sse2)) + return 1; + + return 0; +} + +GLOBAL(void) +jsimd_h2v2_merged_upsample (j_decompress_ptr cinfo, + JSAMPIMAGE input_buf, + JDIMENSION in_row_group_ctr, + JSAMPARRAY output_buf) +{ + void (*sse2fct)(JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY); + + switch(cinfo->out_color_space) { + case JCS_EXT_RGB: + sse2fct=jsimd_h2v2_extrgb_merged_upsample_sse2; + break; + case JCS_EXT_RGBX: + case JCS_EXT_RGBA: + sse2fct=jsimd_h2v2_extrgbx_merged_upsample_sse2; + break; + case JCS_EXT_BGR: + sse2fct=jsimd_h2v2_extbgr_merged_upsample_sse2; + break; + case JCS_EXT_BGRX: + case JCS_EXT_BGRA: + sse2fct=jsimd_h2v2_extbgrx_merged_upsample_sse2; + break; + case JCS_EXT_XBGR: + case JCS_EXT_ABGR: + sse2fct=jsimd_h2v2_extxbgr_merged_upsample_sse2; + break; + case JCS_EXT_XRGB: + case JCS_EXT_ARGB: + sse2fct=jsimd_h2v2_extxrgb_merged_upsample_sse2; + break; + default: + sse2fct=jsimd_h2v2_merged_upsample_sse2; + break; + } + + sse2fct(cinfo->output_width, input_buf, in_row_group_ctr, output_buf); +} + +GLOBAL(void) +jsimd_h2v1_merged_upsample (j_decompress_ptr cinfo, + JSAMPIMAGE input_buf, + JDIMENSION in_row_group_ctr, + JSAMPARRAY output_buf) +{ + void (*sse2fct)(JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY); + + switch(cinfo->out_color_space) { + case JCS_EXT_RGB: + sse2fct=jsimd_h2v1_extrgb_merged_upsample_sse2; + break; + case JCS_EXT_RGBX: + case JCS_EXT_RGBA: + sse2fct=jsimd_h2v1_extrgbx_merged_upsample_sse2; + break; + case JCS_EXT_BGR: + sse2fct=jsimd_h2v1_extbgr_merged_upsample_sse2; + break; + case JCS_EXT_BGRX: + case JCS_EXT_BGRA: + sse2fct=jsimd_h2v1_extbgrx_merged_upsample_sse2; + break; + case JCS_EXT_XBGR: + case JCS_EXT_ABGR: + sse2fct=jsimd_h2v1_extxbgr_merged_upsample_sse2; + break; + case JCS_EXT_XRGB: + case JCS_EXT_ARGB: + sse2fct=jsimd_h2v1_extxrgb_merged_upsample_sse2; + break; + default: + sse2fct=jsimd_h2v1_merged_upsample_sse2; + break; + } + + sse2fct(cinfo->output_width, input_buf, in_row_group_ctr, output_buf); +} + +GLOBAL(int) +jsimd_can_convsamp (void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (DCTSIZE != 8) + return 0; + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + if (sizeof(DCTELEM) != 2) + return 0; + + if (simd_support & JSIMD_SSE2) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_convsamp_float (void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (DCTSIZE != 8) + return 0; + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + if (sizeof(FAST_FLOAT) != 4) + return 0; + + if (simd_support & JSIMD_SSE2) + return 1; + + return 0; +} + +GLOBAL(void) +jsimd_convsamp (JSAMPARRAY sample_data, JDIMENSION start_col, + DCTELEM *workspace) +{ + jsimd_convsamp_sse2(sample_data, start_col, workspace); +} + +GLOBAL(void) +jsimd_convsamp_float (JSAMPARRAY sample_data, JDIMENSION start_col, + FAST_FLOAT *workspace) +{ + jsimd_convsamp_float_sse2(sample_data, start_col, workspace); +} + +GLOBAL(int) +jsimd_can_fdct_islow (void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (DCTSIZE != 8) + return 0; + if (sizeof(DCTELEM) != 2) + return 0; + + if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_fdct_islow_sse2)) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_fdct_ifast (void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (DCTSIZE != 8) + return 0; + if (sizeof(DCTELEM) != 2) + return 0; + + if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_fdct_ifast_sse2)) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_fdct_float (void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (DCTSIZE != 8) + return 0; + if (sizeof(FAST_FLOAT) != 4) + return 0; + + if ((simd_support & JSIMD_SSE) && IS_ALIGNED_SSE(jconst_fdct_float_sse)) + return 1; + + return 0; +} + +GLOBAL(void) +jsimd_fdct_islow (DCTELEM *data) +{ + jsimd_fdct_islow_sse2(data); +} + +GLOBAL(void) +jsimd_fdct_ifast (DCTELEM *data) +{ + jsimd_fdct_ifast_sse2(data); +} + +GLOBAL(void) +jsimd_fdct_float (FAST_FLOAT *data) +{ + jsimd_fdct_float_sse(data); +} + +GLOBAL(int) +jsimd_can_quantize (void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (DCTSIZE != 8) + return 0; + if (sizeof(JCOEF) != 2) + return 0; + if (sizeof(DCTELEM) != 2) + return 0; + + if (simd_support & JSIMD_SSE2) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_quantize_float (void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (DCTSIZE != 8) + return 0; + if (sizeof(JCOEF) != 2) + return 0; + if (sizeof(FAST_FLOAT) != 4) + return 0; + + if (simd_support & JSIMD_SSE2) + return 1; + + return 0; +} + +GLOBAL(void) +jsimd_quantize (JCOEFPTR coef_block, DCTELEM *divisors, + DCTELEM *workspace) +{ + jsimd_quantize_sse2(coef_block, divisors, workspace); +} + +GLOBAL(void) +jsimd_quantize_float (JCOEFPTR coef_block, FAST_FLOAT *divisors, + FAST_FLOAT *workspace) +{ + jsimd_quantize_float_sse2(coef_block, divisors, workspace); +} + +GLOBAL(int) +jsimd_can_idct_2x2 (void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (DCTSIZE != 8) + return 0; + if (sizeof(JCOEF) != 2) + return 0; + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + if (sizeof(ISLOW_MULT_TYPE) != 2) + return 0; + + if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_idct_red_sse2)) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_idct_4x4 (void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (DCTSIZE != 8) + return 0; + if (sizeof(JCOEF) != 2) + return 0; + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + if (sizeof(ISLOW_MULT_TYPE) != 2) + return 0; + + if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_idct_red_sse2)) + return 1; + + return 0; +} + +GLOBAL(void) +jsimd_idct_2x2 (j_decompress_ptr cinfo, jpeg_component_info *compptr, + JCOEFPTR coef_block, JSAMPARRAY output_buf, + JDIMENSION output_col) +{ + jsimd_idct_2x2_sse2(compptr->dct_table, coef_block, output_buf, output_col); +} + +GLOBAL(void) +jsimd_idct_4x4 (j_decompress_ptr cinfo, jpeg_component_info *compptr, + JCOEFPTR coef_block, JSAMPARRAY output_buf, + JDIMENSION output_col) +{ + jsimd_idct_4x4_sse2(compptr->dct_table, coef_block, output_buf, output_col); +} + +GLOBAL(int) +jsimd_can_idct_islow (void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (DCTSIZE != 8) + return 0; + if (sizeof(JCOEF) != 2) + return 0; + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + if (sizeof(ISLOW_MULT_TYPE) != 2) + return 0; + + if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_idct_islow_sse2)) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_idct_ifast (void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (DCTSIZE != 8) + return 0; + if (sizeof(JCOEF) != 2) + return 0; + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + if (sizeof(IFAST_MULT_TYPE) != 2) + return 0; + if (IFAST_SCALE_BITS != 2) + return 0; + + if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_idct_ifast_sse2)) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_idct_float (void) +{ + init_simd(); + + if (DCTSIZE != 8) + return 0; + if (sizeof(JCOEF) != 2) + return 0; + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + if (sizeof(FAST_FLOAT) != 4) + return 0; + if (sizeof(FLOAT_MULT_TYPE) != 4) + return 0; + + if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_idct_float_sse2)) + return 1; + + return 0; +} + +GLOBAL(void) +jsimd_idct_islow (j_decompress_ptr cinfo, jpeg_component_info *compptr, + JCOEFPTR coef_block, JSAMPARRAY output_buf, + JDIMENSION output_col) +{ + jsimd_idct_islow_sse2(compptr->dct_table, coef_block, output_buf, + output_col); +} + +GLOBAL(void) +jsimd_idct_ifast (j_decompress_ptr cinfo, jpeg_component_info *compptr, + JCOEFPTR coef_block, JSAMPARRAY output_buf, + JDIMENSION output_col) +{ + jsimd_idct_ifast_sse2(compptr->dct_table, coef_block, output_buf, + output_col); +} + +GLOBAL(void) +jsimd_idct_float (j_decompress_ptr cinfo, jpeg_component_info *compptr, + JCOEFPTR coef_block, JSAMPARRAY output_buf, + JDIMENSION output_col) +{ + jsimd_idct_float_sse2(compptr->dct_table, coef_block, output_buf, + output_col); +} + +GLOBAL(int) +jsimd_can_huff_encode_one_block (void) +{ + init_simd(); + + if (DCTSIZE != 8) + return 0; + if (sizeof(JCOEF) != 2) + return 0; + + if ((simd_support & JSIMD_SSE2) && simd_huffman && + IS_ALIGNED_SSE(jconst_huff_encode_one_block)) + return 1; + + return 0; +} + +GLOBAL(JOCTET*) +jsimd_huff_encode_one_block (void *state, JOCTET *buffer, JCOEFPTR block, + int last_dc_val, c_derived_tbl *dctbl, + c_derived_tbl *actbl) +{ + return jsimd_huff_encode_one_block_sse2(state, buffer, block, last_dc_val, + dctbl, actbl); +} diff --git a/media/libjpeg/simd/jsimdcfg.inc b/media/libjpeg/simd/jsimdcfg.inc new file mode 100755 index 000000000..9d4aedec9 --- /dev/null +++ b/media/libjpeg/simd/jsimdcfg.inc @@ -0,0 +1,94 @@ +; +; Automatically generated include file from jsimdcfg.inc.h +; +; +; -- jpeglib.h +; +%define DCTSIZE 8 +%define DCTSIZE2 64 +; +; -- jmorecfg.h +; +%define RGB_RED 0 +%define RGB_GREEN 1 +%define RGB_BLUE 2 +%define RGB_PIXELSIZE 3 +%define EXT_RGB_RED 0 +%define EXT_RGB_GREEN 1 +%define EXT_RGB_BLUE 2 +%define EXT_RGB_PIXELSIZE 3 +%define EXT_RGBX_RED 0 +%define EXT_RGBX_GREEN 1 +%define EXT_RGBX_BLUE 2 +%define EXT_RGBX_PIXELSIZE 4 +%define EXT_BGR_RED 2 +%define EXT_BGR_GREEN 1 +%define EXT_BGR_BLUE 0 +%define EXT_BGR_PIXELSIZE 3 +%define EXT_BGRX_RED 2 +%define EXT_BGRX_GREEN 1 +%define EXT_BGRX_BLUE 0 +%define EXT_BGRX_PIXELSIZE 4 +%define EXT_XBGR_RED 3 +%define EXT_XBGR_GREEN 2 +%define EXT_XBGR_BLUE 1 +%define EXT_XBGR_PIXELSIZE 4 +%define EXT_XRGB_RED 1 +%define EXT_XRGB_GREEN 2 +%define EXT_XRGB_BLUE 3 +%define EXT_XRGB_PIXELSIZE 4 +%define RGBX_FILLER_0XFF 1 +; Representation of a single sample (pixel element value). +; On this SIMD implementation, this must be 'unsigned char'. +; +%define JSAMPLE byte ; unsigned char +%define SIZEOF_JSAMPLE SIZEOF_BYTE ; sizeof(JSAMPLE) +%define CENTERJSAMPLE 128 +; Representation of a DCT frequency coefficient. +; On this SIMD implementation, this must be 'short'. +; +%define JCOEF word ; short +%define SIZEOF_JCOEF SIZEOF_WORD ; sizeof(JCOEF) +; Datatype used for image dimensions. +; On this SIMD implementation, this must be 'unsigned int'. +; +%define JDIMENSION dword ; unsigned int +%define SIZEOF_JDIMENSION SIZEOF_DWORD ; sizeof(JDIMENSION) +%define JSAMPROW POINTER ; JSAMPLE * (jpeglib.h) +%define JSAMPARRAY POINTER ; JSAMPROW * (jpeglib.h) +%define JSAMPIMAGE POINTER ; JSAMPARRAY * (jpeglib.h) +%define JCOEFPTR POINTER ; JCOEF * (jpeglib.h) +%define SIZEOF_JSAMPROW SIZEOF_POINTER ; sizeof(JSAMPROW) +%define SIZEOF_JSAMPARRAY SIZEOF_POINTER ; sizeof(JSAMPARRAY) +%define SIZEOF_JSAMPIMAGE SIZEOF_POINTER ; sizeof(JSAMPIMAGE) +%define SIZEOF_JCOEFPTR SIZEOF_POINTER ; sizeof(JCOEFPTR) +; +; -- jdct.h +; +; A forward DCT routine is given a pointer to a work area of type DCTELEM[]; +; the DCT is to be performed in-place in that buffer. +; To maximize parallelism, Type DCTELEM is changed to short (originally, int). +; +%define DCTELEM word ; short +%define SIZEOF_DCTELEM SIZEOF_WORD ; sizeof(DCTELEM) +%define float FP32 ; float +%define SIZEOF_FAST_FLOAT SIZEOF_FP32 ; sizeof(float) +; To maximize parallelism, Type short is changed to short. +; +%define ISLOW_MULT_TYPE word ; must be short +%define SIZEOF_ISLOW_MULT_TYPE SIZEOF_WORD ; sizeof(ISLOW_MULT_TYPE) +%define IFAST_MULT_TYPE word ; must be short +%define SIZEOF_IFAST_MULT_TYPE SIZEOF_WORD ; sizeof(IFAST_MULT_TYPE) +%define IFAST_SCALE_BITS 2 ; fractional bits in scale factors +%define FLOAT_MULT_TYPE FP32 ; must be float +%define SIZEOF_FLOAT_MULT_TYPE SIZEOF_FP32 ; sizeof(FLOAT_MULT_TYPE) +; +; -- jsimd.h +; +%define JSIMD_NONE 0x00 +%define JSIMD_MMX 0x01 +%define JSIMD_3DNOW 0x02 +%define JSIMD_SSE 0x04 +%define JSIMD_SSE2 0x08 +; Short forms of external names for systems with brain-damaged linkers. +; diff --git a/media/libjpeg/simd/jsimdcpu.asm b/media/libjpeg/simd/jsimdcpu.asm new file mode 100644 index 000000000..599083b18 --- /dev/null +++ b/media/libjpeg/simd/jsimdcpu.asm @@ -0,0 +1,104 @@ +; +; jsimdcpu.asm - SIMD instruction support check +; +; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB +; +; Based on the x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler). +; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 +; +; [TAB8] + +%include "jsimdext.inc" + +; -------------------------------------------------------------------------- + SECTION SEG_TEXT + BITS 32 +; +; Check if the CPU supports SIMD instructions +; +; GLOBAL(unsigned int) +; jpeg_simd_cpu_support (void) +; + + align 16 + global EXTN(jpeg_simd_cpu_support) + +EXTN(jpeg_simd_cpu_support): + push ebx +; push ecx ; need not be preserved +; push edx ; need not be preserved +; push esi ; unused + push edi + + xor edi,edi ; simd support flag + + pushfd + pop eax + mov edx,eax + xor eax, 1<<21 ; flip ID bit in EFLAGS + push eax + popfd + pushfd + pop eax + xor eax,edx + jz short .return ; CPUID is not supported + + ; Check for MMX instruction support + xor eax,eax + cpuid + test eax,eax + jz short .return + + xor eax,eax + inc eax + cpuid + mov eax,edx ; eax = Standard feature flags + + test eax, 1<<23 ; bit23:MMX + jz short .no_mmx + or edi, byte JSIMD_MMX +.no_mmx: + test eax, 1<<25 ; bit25:SSE + jz short .no_sse + or edi, byte JSIMD_SSE +.no_sse: + test eax, 1<<26 ; bit26:SSE2 + jz short .no_sse2 + or edi, byte JSIMD_SSE2 +.no_sse2: + + ; Check for 3DNow! instruction support + mov eax, 0x80000000 + cpuid + cmp eax, 0x80000000 + jbe short .return + + mov eax, 0x80000001 + cpuid + mov eax,edx ; eax = Extended feature flags + + test eax, 1<<31 ; bit31:3DNow!(vendor independent) + jz short .no_3dnow + or edi, byte JSIMD_3DNOW +.no_3dnow: + +.return: + mov eax,edi + + pop edi +; pop esi ; unused +; pop edx ; need not be preserved +; pop ecx ; need not be preserved + pop ebx + ret + +; For some reason, the OS X linker does not honor the request to align the +; segment unless we do this. + align 16 diff --git a/media/libjpeg/simd/jsimdext.inc b/media/libjpeg/simd/jsimdext.inc new file mode 100644 index 000000000..f28db60b5 --- /dev/null +++ b/media/libjpeg/simd/jsimdext.inc @@ -0,0 +1,375 @@ +; +; jsimdext.inc - common declarations +; +; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB +; Copyright (C) 2010, D. R. Commander. +; +; Based on the x86 SIMD extension for IJG JPEG library - version 1.02 +; +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; +; This software is provided 'as-is', without any express or implied +; warranty. In no event will the authors be held liable for any damages +; arising from the use of this software. +; +; Permission is granted to anyone to use this software for any purpose, +; including commercial applications, and to alter it and redistribute it +; freely, subject to the following restrictions: +; +; 1. The origin of this software must not be misrepresented; you must not +; claim that you wrote the original software. If you use this software +; in a product, an acknowledgment in the product documentation would be +; appreciated but is not required. +; 2. Altered source versions must be plainly marked as such, and must not be +; misrepresented as being the original software. +; 3. This notice may not be removed or altered from any source distribution. +; +; [TAB8] + +; ========================================================================== +; System-dependent configurations + +%ifdef WIN32 ; ----(nasm -fwin32 -DWIN32 ...)-------- +; * Microsoft Visual C++ +; * MinGW (Minimalist GNU for Windows) +; * CygWin +; * LCC-Win32 + +; -- segment definition -- +; +%ifdef __YASM_VER__ +%define SEG_TEXT .text align=16 +%define SEG_CONST .rdata align=16 +%else +%define SEG_TEXT .text align=16 public use32 class=CODE +%define SEG_CONST .rdata align=16 public use32 class=CONST +%endif + +%elifdef WIN64 ; ----(nasm -fwin64 -DWIN64 ...)-------- +; * Microsoft Visual C++ + +; -- segment definition -- +; +%ifdef __YASM_VER__ +%define SEG_TEXT .text align=16 +%define SEG_CONST .rdata align=16 +%else +%define SEG_TEXT .text align=16 public use64 class=CODE +%define SEG_CONST .rdata align=16 public use64 class=CONST +%endif +%define EXTN(name) name ; foo() -> foo + +%elifdef OBJ32 ; ----(nasm -fobj -DOBJ32 ...)---------- +; * Borland C++ (Win32) + +; -- segment definition -- +; +%define SEG_TEXT _text align=16 public use32 class=CODE +%define SEG_CONST _data align=16 public use32 class=DATA + +%elifdef ELF ; ----(nasm -felf[64] -DELF ...)------------ +; * Linux +; * *BSD family Unix using elf format +; * Unix System V, including Solaris x86, UnixWare and SCO Unix + +; mark stack as non-executable +section .note.GNU-stack noalloc noexec nowrite progbits + +; -- segment definition -- +; +%ifdef __x86_64__ +%define SEG_TEXT .text progbits align=16 +%define SEG_CONST .rodata progbits align=16 +%else +%define SEG_TEXT .text progbits alloc exec nowrite align=16 +%define SEG_CONST .rodata progbits alloc noexec nowrite align=16 +%endif + +; To make the code position-independent, append -DPIC to the commandline +; +%define GOT_SYMBOL _GLOBAL_OFFSET_TABLE_ ; ELF supports PIC +%define EXTN(name) name ; foo() -> foo + +%elifdef AOUT ; ----(nasm -faoutb/aout -DAOUT ...)---- +; * Older Linux using a.out format (nasm -f aout -DAOUT ...) +; * *BSD family Unix using a.out format (nasm -f aoutb -DAOUT ...) + +; -- segment definition -- +; +%define SEG_TEXT .text +%define SEG_CONST .data + +; To make the code position-independent, append -DPIC to the commandline +; +%define GOT_SYMBOL __GLOBAL_OFFSET_TABLE_ ; BSD-style a.out supports PIC + +%elifdef MACHO ; ----(nasm -fmacho -DMACHO ...)-------- +; * NeXTstep/OpenStep/Rhapsody/Darwin/MacOS X (Mach-O format) + +; -- segment definition -- +; +%define SEG_TEXT .text ;align=16 ; nasm doesn't accept align=16. why? +%define SEG_CONST .rodata align=16 + +; The generation of position-independent code (PIC) is the default on Darwin. +; +%define PIC +%define GOT_SYMBOL _MACHO_PIC_ ; Mach-O style code-relative addressing + +%else ; ----(Other case)---------------------- + +; -- segment definition -- +; +%define SEG_TEXT .text +%define SEG_CONST .data + +%endif ; ---------------------------------------------- + +; ========================================================================== + +; -------------------------------------------------------------------------- +; Common types +; +%ifdef __x86_64__ +%define POINTER qword ; general pointer type +%define SIZEOF_POINTER SIZEOF_QWORD ; sizeof(POINTER) +%define POINTER_BIT QWORD_BIT ; sizeof(POINTER)*BYTE_BIT +%else +%define POINTER dword ; general pointer type +%define SIZEOF_POINTER SIZEOF_DWORD ; sizeof(POINTER) +%define POINTER_BIT DWORD_BIT ; sizeof(POINTER)*BYTE_BIT +%endif + +%define INT dword ; signed integer type +%define SIZEOF_INT SIZEOF_DWORD ; sizeof(INT) +%define INT_BIT DWORD_BIT ; sizeof(INT)*BYTE_BIT + +%define FP32 dword ; IEEE754 single +%define SIZEOF_FP32 SIZEOF_DWORD ; sizeof(FP32) +%define FP32_BIT DWORD_BIT ; sizeof(FP32)*BYTE_BIT + +%define MMWORD qword ; int64 (MMX register) +%define SIZEOF_MMWORD SIZEOF_QWORD ; sizeof(MMWORD) +%define MMWORD_BIT QWORD_BIT ; sizeof(MMWORD)*BYTE_BIT + +; NASM is buggy and doesn't properly handle operand sizes for SSE +; instructions, so for now we have to define XMMWORD as blank. +%define XMMWORD ; int128 (SSE register) +%define SIZEOF_XMMWORD SIZEOF_OWORD ; sizeof(XMMWORD) +%define XMMWORD_BIT OWORD_BIT ; sizeof(XMMWORD)*BYTE_BIT + +; Similar hacks for when we load a dword or MMWORD into an xmm# register +%define XMM_DWORD +%define XMM_MMWORD + +%define SIZEOF_BYTE 1 ; sizeof(BYTE) +%define SIZEOF_WORD 2 ; sizeof(WORD) +%define SIZEOF_DWORD 4 ; sizeof(DWORD) +%define SIZEOF_QWORD 8 ; sizeof(QWORD) +%define SIZEOF_OWORD 16 ; sizeof(OWORD) + +%define BYTE_BIT 8 ; CHAR_BIT in C +%define WORD_BIT 16 ; sizeof(WORD)*BYTE_BIT +%define DWORD_BIT 32 ; sizeof(DWORD)*BYTE_BIT +%define QWORD_BIT 64 ; sizeof(QWORD)*BYTE_BIT +%define OWORD_BIT 128 ; sizeof(OWORD)*BYTE_BIT + +; -------------------------------------------------------------------------- +; External Symbol Name +; +%ifndef EXTN +%define EXTN(name) _ %+ name ; foo() -> _foo +%endif + +; -------------------------------------------------------------------------- +; Macros for position-independent code (PIC) support +; +%ifndef GOT_SYMBOL +%undef PIC +%endif + +%ifdef PIC ; ------------------------------------------- + +%ifidn GOT_SYMBOL,_MACHO_PIC_ ; -------------------- + +; At present, nasm doesn't seem to support PIC generation for Mach-O. +; The PIC support code below is a little tricky. + + SECTION SEG_CONST +const_base: + +%define GOTOFF(got,sym) (got) + (sym) - const_base + +%imacro get_GOT 1 + ; NOTE: this macro destroys ecx resister. + call %%geteip + add ecx, byte (%%ref - $) + jmp short %%adjust +%%geteip: + mov ecx, POINTER [esp] + ret +%%adjust: + push ebp + xor ebp,ebp ; ebp = 0 +%ifidni %1,ebx ; (%1 == ebx) + ; db 0x8D,0x9C + jmp near const_base = + ; lea ebx, [ecx+ebp*8+(const_base-%%ref)] ; 8D,9C,E9,(offset32) + db 0x8D,0x9C ; 8D,9C + jmp near const_base ; E9,(const_base-%%ref) +%%ref: +%else ; (%1 != ebx) + ; db 0x8D,0x8C + jmp near const_base = + ; lea ecx, [ecx+ebp*8+(const_base-%%ref)] ; 8D,8C,E9,(offset32) + db 0x8D,0x8C ; 8D,8C + jmp near const_base ; E9,(const_base-%%ref) +%%ref: mov %1, ecx +%endif ; (%1 == ebx) + pop ebp +%endmacro + +%else ; GOT_SYMBOL != _MACHO_PIC_ ---------------- + +%define GOTOFF(got,sym) (got) + (sym) wrt ..gotoff + +%imacro get_GOT 1 + extern GOT_SYMBOL + call %%geteip + add %1, GOT_SYMBOL + $$ - $ wrt ..gotpc + jmp short %%done +%%geteip: + mov %1, POINTER [esp] + ret +%%done: +%endmacro + +%endif ; GOT_SYMBOL == _MACHO_PIC_ ---------------- + +%imacro pushpic 1.nolist + push %1 +%endmacro +%imacro poppic 1.nolist + pop %1 +%endmacro +%imacro movpic 2.nolist + mov %1,%2 +%endmacro + +%else ; !PIC ----------------------------------------- + +%define GOTOFF(got,sym) (sym) + +%imacro get_GOT 1.nolist +%endmacro +%imacro pushpic 1.nolist +%endmacro +%imacro poppic 1.nolist +%endmacro +%imacro movpic 2.nolist +%endmacro + +%endif ; PIC ----------------------------------------- + +; -------------------------------------------------------------------------- +; Align the next instruction on {2,4,8,16,..}-byte boundary. +; ".balign n,,m" in GNU as +; +%define MSKLE(x,y) (~(((y) & 0xFFFF) - ((x) & 0xFFFF)) >> 16) +%define FILLB(b,n) (($$-(b)) & ((n)-1)) + +%imacro alignx 1-2.nolist 0xFFFF +%%bs: times MSKLE(FILLB(%%bs,%1),%2) & MSKLE(16,FILLB($,%1)) & FILLB($,%1) \ + db 0x90 ; nop + times MSKLE(FILLB(%%bs,%1),%2) & FILLB($,%1)/9 \ + db 0x8D,0x9C,0x23,0x00,0x00,0x00,0x00 ; lea ebx,[ebx+0x00000000] + times MSKLE(FILLB(%%bs,%1),%2) & FILLB($,%1)/7 \ + db 0x8D,0xAC,0x25,0x00,0x00,0x00,0x00 ; lea ebp,[ebp+0x00000000] + times MSKLE(FILLB(%%bs,%1),%2) & FILLB($,%1)/6 \ + db 0x8D,0xAD,0x00,0x00,0x00,0x00 ; lea ebp,[ebp+0x00000000] + times MSKLE(FILLB(%%bs,%1),%2) & FILLB($,%1)/4 \ + db 0x8D,0x6C,0x25,0x00 ; lea ebp,[ebp+0x00] + times MSKLE(FILLB(%%bs,%1),%2) & FILLB($,%1)/3 \ + db 0x8D,0x6D,0x00 ; lea ebp,[ebp+0x00] + times MSKLE(FILLB(%%bs,%1),%2) & FILLB($,%1)/2 \ + db 0x8B,0xED ; mov ebp,ebp + times MSKLE(FILLB(%%bs,%1),%2) & FILLB($,%1)/1 \ + db 0x90 ; nop +%endmacro + +; Align the next data on {2,4,8,16,..}-byte boundary. +; +%imacro alignz 1.nolist + align %1, db 0 ; filling zeros +%endmacro + +%ifdef __x86_64__ + +%ifdef WIN64 + +%imacro collect_args 0 + push r12 + push r13 + push r14 + push r15 + mov r10, rcx + mov r11, rdx + mov r12, r8 + mov r13, r9 + mov r14, [rax+48] + mov r15, [rax+56] + push rsi + push rdi + sub rsp, SIZEOF_XMMWORD + movaps XMMWORD [rsp], xmm6 + sub rsp, SIZEOF_XMMWORD + movaps XMMWORD [rsp], xmm7 +%endmacro + +%imacro uncollect_args 0 + movaps xmm7, XMMWORD [rsp] + add rsp, SIZEOF_XMMWORD + movaps xmm6, XMMWORD [rsp] + add rsp, SIZEOF_XMMWORD + pop rdi + pop rsi + pop r15 + pop r14 + pop r13 + pop r12 +%endmacro + +%else + +%imacro collect_args 0 + push r10 + push r11 + push r12 + push r13 + push r14 + push r15 + mov r10, rdi + mov r11, rsi + mov r12, rdx + mov r13, rcx + mov r14, r8 + mov r15, r9 +%endmacro + +%imacro uncollect_args 0 + pop r15 + pop r14 + pop r13 + pop r12 + pop r11 + pop r10 +%endmacro + +%endif + +%endif + +; -------------------------------------------------------------------------- +; Defines picked up from the C headers +; +%include "jsimdcfg.inc" + +; -------------------------------------------------------------------------- |