#ifndef CUFFTDX_FFT_1024_FP32_FWD_PTX_HPP
#define CUFFTDX_FFT_1024_FP32_FWD_PTX_HPP



template<> __forceinline__ __device__ void cufftdx_private_function<83, float, 1>(cufftdx::detail::complex<float> *rmem, unsigned smem){

asm volatile (R"({
.reg .f32 f<1872>;
.reg .b32 r<24>;
.reg .b64 rd<10>;
mov.u32 r1, %tid.y;
shl.b32 r2, r1, 13;
mov.u32 r3, %64;
add.s32 r4, r3, r2;
add.f32 f129, %66, %98;
sub.f32 f131, %66, %98;
add.f32 f1867, %67, %130;
sub.f32 f132, %67, %130;
add.f32 f133, %82, %114;
sub.f32 f135, %82, %114;
add.f32 f1865, %131, %115;
sub.f32 f136, %131, %115;
add.f32 f137, f129, f133;
sub.f32 f139, f129, f133;
add.f32 f1864, f1867, f1865;
sub.f32 f140, f1867, f1865;
add.f32 f141, f131, f136;
sub.f32 f143, f131, f136;
sub.f32 f1863, f132, f135;
add.f32 f144, f132, f135;
add.f32 f145, %74, %106;
sub.f32 f147, %74, %106;
add.f32 f1860, %132, %133;
sub.f32 f148, %132, %133;
add.f32 f149, %90, %122;
sub.f32 f151, %90, %122;
add.f32 f1858, %91, %134;
sub.f32 f152, %91, %134;
add.f32 f153, f145, f149;
sub.f32 f155, f145, f149;
add.f32 f1857, f1860, f1858;
sub.f32 f156, f1860, f1858;
add.f32 f157, f147, f152;
sub.f32 f159, f147, f152;
sub.f32 f1856, f148, f151;
add.f32 f160, f148, f151;
mul.f32 f162, f1856, 0fBF3504F3;
mul.f32 f1855, f157, 0f3F3504F3;
sub.f32 f163, f1855, f162;
mul.f32 f164, f1856, 0f3F3504F3;
fma.rn.f32 f165, f157, 0fBF3504F3, f164;
mul.f32 f166, f159, 0fBF3504F3;
mul.f32 f167, f160, 0fBF3504F3;
sub.f32 f168, f166, f167;
add.f32 f169, f166, f167;
add.f32 f170, f137, f153;
sub.f32 f172, f137, f153;
add.f32 f1854, f1864, f1857;
sub.f32 f173, f1864, f1857;
add.f32 f174, f141, f163;
sub.f32 f176, f141, f163;
add.f32 f1853, f1863, f165;
sub.f32 f177, f1863, f165;
add.f32 f178, f139, f156;
sub.f32 f180, f139, f156;
sub.f32 f1852, f140, f155;
add.f32 f181, f140, f155;
add.f32 f182, f143, f168;
sub.f32 f184, f143, f168;
add.f32 f1851, f144, f169;
sub.f32 f185, f144, f169;
add.f32 f186, %70, %102;
sub.f32 f188, %70, %102;
add.f32 f1849, %135, %103;
sub.f32 f189, %135, %103;
add.f32 f190, %86, %118;
sub.f32 f192, %86, %118;
add.f32 f1846, %137, %136;
sub.f32 f193, %137, %136;
add.f32 f194, f186, f190;
sub.f32 f196, f186, f190;
add.f32 f1845, f1849, f1846;
sub.f32 f197, f1849, f1846;
add.f32 f198, f188, f193;
sub.f32 f200, f188, f193;
sub.f32 f1844, f189, f192;
add.f32 f201, f189, f192;
add.f32 f202, %78, %110;
sub.f32 f204, %78, %110;
add.f32 f1842, %79, %138;
sub.f32 f205, %79, %138;
add.f32 f206, %94, %126;
sub.f32 f208, %94, %126;
add.f32 f1840, %139, %127;
sub.f32 f209, %139, %127;
add.f32 f210, f202, f206;
sub.f32 f212, f202, f206;
add.f32 f1839, f1842, f1840;
sub.f32 f213, f1842, f1840;
add.f32 f214, f204, f209;
sub.f32 f216, f204, f209;
sub.f32 f1838, f205, f208;
add.f32 f217, f205, f208;
mul.f32 f219, f1838, 0fBF3504F3;
mul.f32 f1837, f214, 0f3F3504F3;
sub.f32 f220, f1837, f219;
mul.f32 f221, f1838, 0f3F3504F3;
fma.rn.f32 f222, f214, 0fBF3504F3, f221;
mul.f32 f223, f216, 0fBF3504F3;
mul.f32 f224, f217, 0fBF3504F3;
sub.f32 f225, f223, f224;
add.f32 f226, f223, f224;
add.f32 f227, f194, f210;
sub.f32 f229, f194, f210;
add.f32 f1836, f1845, f1839;
sub.f32 f230, f1845, f1839;
add.f32 f231, f198, f220;
sub.f32 f233, f198, f220;
add.f32 f1835, f1844, f222;
sub.f32 f234, f1844, f222;
add.f32 f235, f196, f213;
sub.f32 f237, f196, f213;
sub.f32 f1834, f197, f212;
add.f32 f238, f197, f212;
add.f32 f239, f200, f225;
sub.f32 f241, f200, f225;
add.f32 f1833, f201, f226;
sub.f32 f242, f201, f226;
mul.f32 f1831, f231, 0f3F6C835E;
mul.f32 f1832, f1835, 0fBEC3EF15;
sub.f32 f245, f1831, f1832;
mul.f32 f246, f1835, 0f3F6C835E;
fma.rn.f32 f247, f231, 0fBEC3EF15, f246;
mul.f32 f1829, f235, 0f3F3504F3;
mul.f32 f1830, f1834, 0fBF3504F3;
sub.f32 f250, f1829, f1830;
mul.f32 f251, f1834, 0f3F3504F3;
fma.rn.f32 f252, f235, 0fBF3504F3, f251;
mul.f32 f1827, f239, 0f3EC3EF15;
mul.f32 f1828, f1833, 0fBF6C835E;
sub.f32 f255, f1827, f1828;
mul.f32 f256, f1833, 0f3EC3EF15;
fma.rn.f32 f257, f239, 0fBF6C835E, f256;
mul.f32 f1825, f233, 0fBEC3EF15;
mul.f32 f1826, f234, 0fBF6C835E;
sub.f32 f260, f1825, f1826;
mul.f32 f261, f234, 0fBEC3EF15;
fma.rn.f32 f262, f233, 0fBF6C835E, f261;
mul.f32 f263, f237, 0fBF3504F3;
mul.f32 f264, f238, 0fBF3504F3;
sub.f32 f265, f263, f264;
add.f32 f266, f263, f264;
mul.f32 f1823, f241, 0fBF6C835E;
mul.f32 f1824, f242, 0fBEC3EF15;
sub.f32 f269, f1823, f1824;
mul.f32 f270, f242, 0fBF6C835E;
fma.rn.f32 f271, f241, 0fBEC3EF15, f270;
add.f32 f272, f170, f227;
sub.f32 f274, f170, f227;
add.f32 f1822, f1854, f1836;
sub.f32 f275, f1854, f1836;
add.f32 f276, f174, f245;
sub.f32 f278, f174, f245;
add.f32 f1821, f1853, f247;
sub.f32 f279, f1853, f247;
add.f32 f280, f178, f250;
sub.f32 f282, f178, f250;
add.f32 f1820, f1852, f252;
sub.f32 f283, f1852, f252;
add.f32 f284, f182, f255;
sub.f32 f286, f182, f255;
add.f32 f1819, f1851, f257;
sub.f32 f287, f1851, f257;
add.f32 f288, f172, f230;
sub.f32 f290, f172, f230;
sub.f32 f1818, f173, f229;
add.f32 f291, f173, f229;
add.f32 f292, f176, f260;
sub.f32 f294, f176, f260;
add.f32 f1817, f177, f262;
sub.f32 f295, f177, f262;
add.f32 f296, f180, f265;
sub.f32 f298, f180, f265;
add.f32 f1816, f181, f266;
sub.f32 f299, f181, f266;
add.f32 f300, f184, f269;
sub.f32 f302, f184, f269;
add.f32 f1815, f185, f271;
sub.f32 f303, f185, f271;
add.f32 f304, %68, %100;
sub.f32 f306, %68, %100;
add.f32 f1812, %141, %140;
sub.f32 f307, %141, %140;
add.f32 f308, %84, %116;
sub.f32 f310, %84, %116;
add.f32 f1810, %85, %142;
sub.f32 f311, %85, %142;
add.f32 f312, f304, f308;
sub.f32 f314, f304, f308;
add.f32 f1809, f1812, f1810;
sub.f32 f315, f1812, f1810;
add.f32 f316, f306, f311;
sub.f32 f318, f306, f311;
sub.f32 f1808, f307, f310;
add.f32 f319, f307, f310;
add.f32 f320, %76, %108;
sub.f32 f322, %76, %108;
add.f32 f1806, %143, %109;
sub.f32 f323, %143, %109;
add.f32 f324, %92, %124;
sub.f32 f326, %92, %124;
add.f32 f1803, %145, %144;
sub.f32 f327, %145, %144;
add.f32 f328, f320, f324;
sub.f32 f330, f320, f324;
add.f32 f1802, f1806, f1803;
sub.f32 f331, f1806, f1803;
add.f32 f332, f322, f327;
sub.f32 f334, f322, f327;
sub.f32 f1801, f323, f326;
add.f32 f335, f323, f326;
mul.f32 f1799, f332, 0f3F3504F3;
mul.f32 f1800, f1801, 0fBF3504F3;
sub.f32 f338, f1799, f1800;
mul.f32 f339, f1801, 0f3F3504F3;
fma.rn.f32 f340, f332, 0fBF3504F3, f339;
mul.f32 f341, f334, 0fBF3504F3;
mul.f32 f342, f335, 0fBF3504F3;
sub.f32 f343, f341, f342;
add.f32 f344, f341, f342;
add.f32 f345, f312, f328;
sub.f32 f347, f312, f328;
add.f32 f1798, f1809, f1802;
sub.f32 f348, f1809, f1802;
add.f32 f349, f316, f338;
sub.f32 f351, f316, f338;
add.f32 f1797, f1808, f340;
sub.f32 f352, f1808, f340;
add.f32 f353, f314, f331;
sub.f32 f355, f314, f331;
sub.f32 f1796, f315, f330;
add.f32 f356, f315, f330;
add.f32 f357, f318, f343;
sub.f32 f359, f318, f343;
add.f32 f1795, f319, f344;
sub.f32 f360, f319, f344;
add.f32 f361, %72, %104;
sub.f32 f363, %72, %104;
add.f32 f1793, %73, %146;
sub.f32 f364, %73, %146;
add.f32 f365, %88, %120;
sub.f32 f367, %88, %120;
add.f32 f1791, %147, %121;
sub.f32 f368, %147, %121;
add.f32 f369, f361, f365;
sub.f32 f371, f361, f365;
add.f32 f1790, f1793, f1791;
sub.f32 f372, f1793, f1791;
add.f32 f373, f363, f368;
sub.f32 f375, f363, f368;
sub.f32 f1789, f364, f367;
add.f32 f376, f364, f367;
add.f32 f377, %80, %112;
sub.f32 f379, %80, %112;
add.f32 f1786, %148, %149;
sub.f32 f380, %148, %149;
add.f32 f381, %96, %128;
sub.f32 f383, %96, %128;
add.f32 f1785, %97, %129;
sub.f32 f384, %97, %129;
add.f32 f385, f377, f381;
sub.f32 f387, f377, f381;
add.f32 f1784, f1786, f1785;
sub.f32 f388, f1786, f1785;
add.f32 f389, f379, f384;
sub.f32 f391, f379, f384;
sub.f32 f1783, f380, f383;
add.f32 f392, f380, f383;
mul.f32 f1781, f389, 0f3F3504F3;
mul.f32 f1782, f1783, 0fBF3504F3;
sub.f32 f395, f1781, f1782;
mul.f32 f396, f1783, 0f3F3504F3;
fma.rn.f32 f397, f389, 0fBF3504F3, f396;
mul.f32 f398, f391, 0fBF3504F3;
mul.f32 f399, f392, 0fBF3504F3;
sub.f32 f400, f398, f399;
add.f32 f401, f398, f399;
add.f32 f402, f369, f385;
sub.f32 f404, f369, f385;
add.f32 f1780, f1790, f1784;
sub.f32 f405, f1790, f1784;
add.f32 f406, f373, f395;
sub.f32 f408, f373, f395;
add.f32 f1779, f1789, f397;
sub.f32 f409, f1789, f397;
add.f32 f410, f371, f388;
sub.f32 f412, f371, f388;
sub.f32 f1778, f372, f387;
add.f32 f413, f372, f387;
add.f32 f414, f375, f400;
sub.f32 f416, f375, f400;
add.f32 f1777, f376, f401;
sub.f32 f417, f376, f401;
mul.f32 f419, f1779, 0fBEC3EF15;
mul.f32 f1776, f406, 0f3F6C835E;
sub.f32 f420, f1776, f419;
mul.f32 f421, f1779, 0f3F6C835E;
fma.rn.f32 f422, f406, 0fBEC3EF15, f421;
mul.f32 f424, f1778, 0fBF3504F3;
mul.f32 f1775, f410, 0f3F3504F3;
sub.f32 f425, f1775, f424;
mul.f32 f426, f1778, 0f3F3504F3;
fma.rn.f32 f427, f410, 0fBF3504F3, f426;
mul.f32 f1773, f414, 0f3EC3EF15;
mul.f32 f1774, f1777, 0fBF6C835E;
sub.f32 f430, f1773, f1774;
mul.f32 f431, f1777, 0f3EC3EF15;
fma.rn.f32 f432, f414, 0fBF6C835E, f431;
mul.f32 f1771, f408, 0fBEC3EF15;
mul.f32 f1772, f409, 0fBF6C835E;
sub.f32 f435, f1771, f1772;
mul.f32 f436, f409, 0fBEC3EF15;
fma.rn.f32 f437, f408, 0fBF6C835E, f436;
mul.f32 f438, f412, 0fBF3504F3;
mul.f32 f439, f413, 0fBF3504F3;
sub.f32 f440, f438, f439;
add.f32 f441, f438, f439;
mul.f32 f443, f417, 0fBEC3EF15;
mul.f32 f1770, f416, 0fBF6C835E;
sub.f32 f444, f1770, f443;
mul.f32 f445, f417, 0fBF6C835E;
fma.rn.f32 f446, f416, 0fBEC3EF15, f445;
add.f32 f447, f345, f402;
sub.f32 f449, f345, f402;
add.f32 f1769, f1798, f1780;
sub.f32 f450, f1798, f1780;
add.f32 f451, f349, f420;
sub.f32 f453, f349, f420;
add.f32 f1768, f1797, f422;
sub.f32 f454, f1797, f422;
add.f32 f455, f353, f425;
sub.f32 f457, f353, f425;
add.f32 f1767, f1796, f427;
sub.f32 f458, f1796, f427;
add.f32 f459, f357, f430;
sub.f32 f461, f357, f430;
add.f32 f1766, f1795, f432;
sub.f32 f462, f1795, f432;
add.f32 f463, f347, f405;
sub.f32 f465, f347, f405;
sub.f32 f1765, f348, f404;
add.f32 f466, f348, f404;
add.f32 f467, f351, f435;
sub.f32 f469, f351, f435;
add.f32 f1764, f352, f437;
sub.f32 f470, f352, f437;
add.f32 f471, f355, f440;
sub.f32 f473, f355, f440;
add.f32 f1763, f356, f441;
sub.f32 f474, f356, f441;
add.f32 f475, f359, f444;
sub.f32 f477, f359, f444;
add.f32 f1762, f360, f446;
sub.f32 f478, f360, f446;
mul.f32 f480, f1768, 0fBE47C5C2;
mul.f32 f1761, f451, 0f3F7B14BE;
sub.f32 f481, f1761, f480;
mul.f32 f482, f1768, 0f3F7B14BE;
fma.rn.f32 f483, f451, 0fBE47C5C2, f482;
mul.f32 f485, f1767, 0fBEC3EF15;
mul.f32 f1760, f455, 0f3F6C835E;
sub.f32 f486, f1760, f485;
mul.f32 f487, f1767, 0f3F6C835E;
fma.rn.f32 f488, f455, 0fBEC3EF15, f487;
mul.f32 f490, f1766, 0fBF0E39DA;
mul.f32 f1759, f459, 0f3F54DB31;
sub.f32 f491, f1759, f490;
mul.f32 f492, f1766, 0f3F54DB31;
fma.rn.f32 f493, f459, 0fBF0E39DA, f492;
mul.f32 f495, f1765, 0fBF3504F3;
mul.f32 f1758, f463, 0f3F3504F3;
sub.f32 f496, f1758, f495;
mul.f32 f497, f1765, 0f3F3504F3;
fma.rn.f32 f498, f463, 0fBF3504F3, f497;
mul.f32 f1756, f467, 0f3F0E39DA;
mul.f32 f1757, f1764, 0fBF54DB31;
sub.f32 f501, f1756, f1757;
mul.f32 f502, f1764, 0f3F0E39DA;
fma.rn.f32 f503, f467, 0fBF54DB31, f502;
mul.f32 f1754, f471, 0f3EC3EF15;
mul.f32 f1755, f1763, 0fBF6C835E;
sub.f32 f506, f1754, f1755;
mul.f32 f507, f1763, 0f3EC3EF15;
fma.rn.f32 f508, f471, 0fBF6C835E, f507;
mul.f32 f1752, f475, 0f3E47C5C2;
mul.f32 f1753, f1762, 0fBF7B14BE;
sub.f32 f511, f1752, f1753;
mul.f32 f512, f1762, 0f3E47C5C2;
fma.rn.f32 f513, f475, 0fBF7B14BE, f512;
mul.f32 f1750, f453, 0fBE47C5C2;
mul.f32 f1751, f454, 0fBF7B14BE;
sub.f32 f516, f1750, f1751;
mul.f32 f517, f454, 0fBE47C5C2;
fma.rn.f32 f518, f453, 0fBF7B14BE, f517;
mul.f32 f520, f458, 0fBF6C835E;
mul.f32 f1749, f457, 0fBEC3EF15;
sub.f32 f521, f1749, f520;
mul.f32 f522, f458, 0fBEC3EF15;
fma.rn.f32 f523, f457, 0fBF6C835E, f522;
mul.f32 f525, f462, 0fBF54DB31;
mul.f32 f1748, f461, 0fBF0E39DA;
sub.f32 f526, f1748, f525;
mul.f32 f527, f462, 0fBF0E39DA;
fma.rn.f32 f528, f461, 0fBF54DB31, f527;
mul.f32 f529, f465, 0fBF3504F3;
mul.f32 f530, f466, 0fBF3504F3;
sub.f32 f531, f529, f530;
add.f32 f532, f529, f530;
mul.f32 f1746, f469, 0fBF54DB31;
mul.f32 f1747, f470, 0fBF0E39DA;
sub.f32 f535, f1746, f1747;
mul.f32 f536, f470, 0fBF54DB31;
fma.rn.f32 f537, f469, 0fBF0E39DA, f536;
mul.f32 f539, f474, 0fBEC3EF15;
mul.f32 f1745, f473, 0fBF6C835E;
sub.f32 f540, f1745, f539;
mul.f32 f541, f474, 0fBF6C835E;
fma.rn.f32 f542, f473, 0fBEC3EF15, f541;
mul.f32 f544, f478, 0fBE47C5C2;
mul.f32 f1744, f477, 0fBF7B14BE;
sub.f32 f545, f1744, f544;
mul.f32 f546, f478, 0fBF7B14BE;
fma.rn.f32 f547, f477, 0fBE47C5C2, f546;
add.f32 f550, f276, f481;
sub.f32 f552, f276, f481;
add.f32 f1743, f1821, f483;
sub.f32 f553, f1821, f483;
add.f32 f554, f280, f486;
sub.f32 f556, f280, f486;
add.f32 f1742, f1820, f488;
sub.f32 f557, f1820, f488;
add.f32 f558, f284, f491;
sub.f32 f560, f284, f491;
add.f32 f1741, f1819, f493;
sub.f32 f561, f1819, f493;
add.f32 f562, f288, f496;
sub.f32 f564, f288, f496;
add.f32 f1740, f1818, f498;
sub.f32 f565, f1818, f498;
add.f32 f566, f292, f501;
sub.f32 f568, f292, f501;
add.f32 f1739, f1817, f503;
sub.f32 f569, f1817, f503;
add.f32 f570, f296, f506;
sub.f32 f572, f296, f506;
add.f32 f1738, f1816, f508;
sub.f32 f573, f1816, f508;
add.f32 f574, f300, f511;
sub.f32 f576, f300, f511;
add.f32 f1737, f1815, f513;
sub.f32 f577, f1815, f513;
add.f32 f578, f274, f450;
sub.f32 f580, f274, f450;
sub.f32 f1736, f275, f449;
add.f32 f581, f275, f449;
add.f32 f582, f278, f516;
sub.f32 f584, f278, f516;
add.f32 f1735, f279, f518;
sub.f32 f585, f279, f518;
add.f32 f586, f282, f521;
sub.f32 f588, f282, f521;
add.f32 f1734, f283, f523;
sub.f32 f589, f283, f523;
add.f32 f590, f286, f526;
sub.f32 f592, f286, f526;
add.f32 f1733, f287, f528;
sub.f32 f593, f287, f528;
add.f32 f594, f290, f531;
sub.f32 f596, f290, f531;
add.f32 f1732, f291, f532;
sub.f32 f597, f291, f532;
add.f32 f598, f294, f535;
sub.f32 f600, f294, f535;
add.f32 f1731, f295, f537;
sub.f32 f601, f295, f537;
add.f32 f602, f298, f540;
sub.f32 f604, f298, f540;
add.f32 f1730, f299, f542;
sub.f32 f605, f299, f542;
add.f32 f606, f302, f545;
sub.f32 f608, f302, f545;
add.f32 f1729, f303, f547;
sub.f32 f609, f303, f547;
mov.u32 r15, %tid.x;
shl.b32 r7, r15, 8;
and.b32 r8, r7, -8192;
add.s32 r9, r4, r8;
and.b32 r14, r15, 31;
shl.b32 r10, r15, 3;
cvt.u64.u32 rd2, r10;
and.b64 rd3, rd2, 248;
mov.u64 rd4, %65;
add.s64 rd5, rd4, rd3;
ld.global.v2.f32 {f610, f611}, [rd5];
mul.f32 f615, f611, f1743;
mul.f32 f616, f610, f1743;
mul.f32 f618, f611, f611;
mul.f32 f1728, f610, f610;
sub.f32 f619, f1728, f618;
mul.f32 f620, f611, f610;
fma.rn.f32 f621, f611, f610, f620;
mul.f32 f623, f621, f1742;
mul.f32 f624, f619, f1742;
mul.f32 f626, f611, f621;
mul.f32 f1727, f610, f619;
sub.f32 f627, f1727, f626;
mul.f32 f1726, f619, f554;
mul.f32 f628, f610, f621;
fma.rn.f32 f629, f611, f619, f628;
mul.f32 f631, f629, f1741;
mul.f32 f632, f627, f1741;
mul.f32 f1724, f610, f627;
mul.f32 f1725, f611, f629;
sub.f32 f635, f1724, f1725;
mul.f32 f1723, f627, f558;
mul.f32 f636, f610, f629;
fma.rn.f32 f637, f611, f627, f636;
mul.f32 f639, f637, f1740;
mul.f32 f640, f635, f1740;
mul.f32 f642, f611, f637;
mul.f32 f1722, f610, f635;
sub.f32 f643, f1722, f642;
mul.f32 f1721, f635, f562;
mul.f32 f644, f610, f637;
fma.rn.f32 f645, f611, f635, f644;
mul.f32 f647, f645, f1739;
mul.f32 f648, f643, f1739;
mul.f32 f1719, f610, f643;
mul.f32 f1720, f611, f645;
sub.f32 f651, f1719, f1720;
mul.f32 f1718, f643, f566;
mul.f32 f652, f610, f645;
fma.rn.f32 f653, f611, f643, f652;
mul.f32 f655, f653, f1738;
mul.f32 f656, f651, f1738;
mul.f32 f658, f611, f653;
mul.f32 f1717, f610, f651;
sub.f32 f659, f1717, f658;
mul.f32 f1716, f651, f570;
mul.f32 f660, f610, f653;
fma.rn.f32 f661, f611, f651, f660;
mul.f32 f663, f661, f1737;
mul.f32 f664, f659, f1737;
mul.f32 f666, f611, f661;
mul.f32 f1715, f610, f659;
sub.f32 f667, f1715, f666;
mul.f32 f1714, f659, f574;
mul.f32 f668, f610, f661;
fma.rn.f32 f669, f611, f659, f668;
mul.f32 f671, f669, f1736;
mul.f32 f672, f667, f1736;
mul.f32 f1712, f610, f667;
mul.f32 f1713, f611, f669;
sub.f32 f675, f1712, f1713;
mul.f32 f1711, f667, f578;
mul.f32 f676, f610, f669;
fma.rn.f32 f677, f611, f667, f676;
mul.f32 f679, f677, f1735;
mul.f32 f680, f675, f1735;
mul.f32 f682, f611, f677;
mul.f32 f1710, f610, f675;
sub.f32 f683, f1710, f682;
mul.f32 f1709, f675, f582;
mul.f32 f684, f610, f677;
fma.rn.f32 f685, f611, f675, f684;
mul.f32 f687, f685, f1734;
mul.f32 f688, f683, f1734;
mul.f32 f690, f611, f685;
mul.f32 f1708, f610, f683;
sub.f32 f691, f1708, f690;
mul.f32 f1707, f683, f586;
mul.f32 f692, f610, f685;
fma.rn.f32 f693, f611, f683, f692;
mul.f32 f695, f693, f1733;
mul.f32 f696, f691, f1733;
mul.f32 f1705, f610, f691;
mul.f32 f1706, f611, f693;
sub.f32 f699, f1705, f1706;
mul.f32 f1704, f691, f590;
mul.f32 f700, f610, f693;
fma.rn.f32 f701, f611, f691, f700;
mul.f32 f703, f701, f1732;
mul.f32 f704, f699, f1732;
mul.f32 f706, f611, f701;
mul.f32 f1703, f610, f699;
sub.f32 f707, f1703, f706;
mul.f32 f1702, f699, f594;
mul.f32 f708, f610, f701;
fma.rn.f32 f709, f611, f699, f708;
mul.f32 f711, f709, f1731;
mul.f32 f712, f707, f1731;
mul.f32 f1700, f610, f707;
mul.f32 f1701, f611, f709;
sub.f32 f715, f1700, f1701;
mul.f32 f1699, f707, f598;
mul.f32 f716, f610, f709;
fma.rn.f32 f717, f611, f707, f716;
mul.f32 f719, f717, f1730;
mul.f32 f720, f715, f1730;
mul.f32 f722, f611, f717;
mul.f32 f1698, f610, f715;
sub.f32 f723, f1698, f722;
mul.f32 f1697, f715, f602;
mul.f32 f724, f610, f717;
fma.rn.f32 f725, f611, f715, f724;
mul.f32 f727, f725, f1729;
mul.f32 f728, f723, f1729;
mul.f32 f730, f611, f725;
mul.f32 f1696, f610, f723;
sub.f32 f731, f1696, f730;
mul.f32 f1695, f723, f606;
mul.f32 f732, f610, f725;
fma.rn.f32 f733, f611, f723, f732;
sub.f32 f1694, f1822, f1769;
mul.f32 f735, f733, f1694;
mul.f32 f736, f731, f1694;
mul.f32 f1692, f610, f731;
mul.f32 f1693, f611, f733;
sub.f32 f739, f1692, f1693;
sub.f32 f1691, f272, f447;
mul.f32 f1690, f731, f1691;
mul.f32 f740, f610, f733;
fma.rn.f32 f741, f611, f731, f740;
mul.f32 f743, f741, f553;
mul.f32 f744, f739, f553;
mul.f32 f746, f611, f741;
mul.f32 f1689, f610, f739;
sub.f32 f747, f1689, f746;
mul.f32 f1688, f739, f552;
mul.f32 f748, f610, f741;
fma.rn.f32 f749, f611, f739, f748;
mul.f32 f751, f749, f557;
mul.f32 f752, f747, f557;
mul.f32 f754, f611, f749;
mul.f32 f1687, f610, f747;
sub.f32 f755, f1687, f754;
mul.f32 f1686, f747, f556;
mul.f32 f756, f610, f749;
fma.rn.f32 f757, f611, f747, f756;
mul.f32 f759, f757, f561;
mul.f32 f760, f755, f561;
mul.f32 f1684, f610, f755;
mul.f32 f1685, f611, f757;
sub.f32 f763, f1684, f1685;
mul.f32 f1683, f755, f560;
mul.f32 f764, f610, f757;
fma.rn.f32 f765, f611, f755, f764;
mul.f32 f767, f765, f565;
mul.f32 f768, f763, f565;
mul.f32 f770, f611, f765;
mul.f32 f1682, f610, f763;
sub.f32 f771, f1682, f770;
mul.f32 f1681, f763, f564;
mul.f32 f772, f610, f765;
fma.rn.f32 f773, f611, f763, f772;
mul.f32 f775, f773, f569;
mul.f32 f776, f771, f569;
mul.f32 f1679, f610, f771;
mul.f32 f1680, f611, f773;
sub.f32 f779, f1679, f1680;
mul.f32 f1678, f771, f568;
mul.f32 f780, f610, f773;
fma.rn.f32 f781, f611, f771, f780;
mul.f32 f783, f781, f573;
mul.f32 f784, f779, f573;
mul.f32 f786, f611, f781;
mul.f32 f1677, f610, f779;
sub.f32 f787, f1677, f786;
mul.f32 f1676, f779, f572;
mul.f32 f788, f610, f781;
fma.rn.f32 f789, f611, f779, f788;
mul.f32 f791, f789, f577;
mul.f32 f792, f787, f577;
mul.f32 f794, f611, f789;
mul.f32 f1675, f610, f787;
sub.f32 f795, f1675, f794;
mul.f32 f1674, f787, f576;
mul.f32 f796, f610, f789;
fma.rn.f32 f797, f611, f787, f796;
mul.f32 f799, f797, f581;
mul.f32 f800, f795, f581;
mul.f32 f1672, f610, f795;
mul.f32 f1673, f611, f797;
sub.f32 f803, f1672, f1673;
mul.f32 f1671, f795, f580;
mul.f32 f804, f610, f797;
fma.rn.f32 f805, f611, f795, f804;
mul.f32 f807, f805, f585;
mul.f32 f808, f803, f585;
mul.f32 f810, f611, f805;
mul.f32 f1670, f610, f803;
sub.f32 f811, f1670, f810;
mul.f32 f1669, f803, f584;
mul.f32 f812, f610, f805;
fma.rn.f32 f813, f611, f803, f812;
mul.f32 f815, f813, f589;
mul.f32 f816, f811, f589;
mul.f32 f818, f611, f813;
mul.f32 f1668, f610, f811;
sub.f32 f819, f1668, f818;
mul.f32 f1667, f811, f588;
mul.f32 f820, f610, f813;
fma.rn.f32 f821, f611, f811, f820;
mul.f32 f823, f821, f593;
mul.f32 f824, f819, f593;
mul.f32 f1665, f610, f819;
mul.f32 f1666, f611, f821;
sub.f32 f827, f1665, f1666;
mul.f32 f1664, f819, f592;
mul.f32 f828, f610, f821;
fma.rn.f32 f829, f611, f819, f828;
mul.f32 f831, f829, f597;
mul.f32 f832, f827, f597;
mul.f32 f834, f611, f829;
mul.f32 f1663, f610, f827;
sub.f32 f835, f1663, f834;
mul.f32 f1662, f827, f596;
mul.f32 f836, f610, f829;
fma.rn.f32 f837, f611, f827, f836;
mul.f32 f839, f837, f601;
mul.f32 f840, f835, f601;
mul.f32 f1660, f610, f835;
mul.f32 f1661, f611, f837;
sub.f32 f843, f1660, f1661;
mul.f32 f1659, f835, f600;
mul.f32 f844, f610, f837;
fma.rn.f32 f845, f611, f835, f844;
mul.f32 f847, f845, f605;
mul.f32 f848, f843, f605;
mul.f32 f850, f611, f845;
mul.f32 f1658, f610, f843;
sub.f32 f851, f1658, f850;
mul.f32 f1657, f610, f550;
mul.f32 f852, f610, f845;
mul.f32 f1656, f843, f604;
fma.rn.f32 f853, f611, f843, f852;
mul.f32 f854, f851, f608;
mul.f32 f855, f853, f609;
mul.f32 f856, f851, f609;
mov.u32 r17, %tid.x;
shl.b32 r16, r17, 8;
barrier.sync 0;
and.b32 r11, r16, 7936;
add.s32 r12, r9, r11;
sub.f32 f1870, f1822, f1769;
mul.f32 f1869, f733, f1870;
add.f32 f857, f1822, f1769;
mov.u32 r19, %tid.x;
and.b32 r18, r19, 31;
sub.f32 f1871, f272, f447;
add.f32 f858, f272, f447;
mov.u32 r21, %tid.x;
and.b32 r20, r21, 31;
mov.u32 r23, %tid.x;
and.b32 r22, r23, 31;
fma.rn.f32 f859, f611, f550, f616;
sub.f32 f860, f1657, f615;
st.shared.v4.f32 [r12], {f858, f857, f860, f859};
fma.rn.f32 f861, f621, f554, f624;
sub.f32 f862, f1726, f623;
fma.rn.f32 f863, f629, f558, f632;
sub.f32 f864, f1723, f631;
st.shared.v4.f32 [r12+16], {f862, f861, f864, f863};
fma.rn.f32 f865, f637, f562, f640;
sub.f32 f866, f1721, f639;
sub.f32 f867, f1718, f647;
fma.rn.f32 f868, f645, f566, f648;
st.shared.v4.f32 [r12+32], {f866, f865, f867, f868};
fma.rn.f32 f869, f653, f570, f656;
sub.f32 f870, f1716, f655;
fma.rn.f32 f871, f661, f574, f664;
sub.f32 f872, f1714, f663;
st.shared.v4.f32 [r12+48], {f870, f869, f872, f871};
fma.rn.f32 f873, f669, f578, f672;
sub.f32 f874, f1711, f671;
fma.rn.f32 f875, f677, f582, f680;
sub.f32 f876, f1709, f679;
st.shared.v4.f32 [r12+64], {f874, f873, f876, f875};
fma.rn.f32 f877, f685, f586, f688;
sub.f32 f878, f1707, f687;
fma.rn.f32 f879, f693, f590, f696;
sub.f32 f880, f1704, f695;
st.shared.v4.f32 [r12+80], {f878, f877, f880, f879};
fma.rn.f32 f881, f701, f594, f704;
sub.f32 f882, f1702, f703;
fma.rn.f32 f883, f709, f598, f712;
sub.f32 f884, f1699, f711;
st.shared.v4.f32 [r12+96], {f882, f881, f884, f883};
fma.rn.f32 f885, f717, f602, f720;
sub.f32 f886, f1697, f719;
fma.rn.f32 f887, f725, f606, f728;
sub.f32 f888, f1695, f727;
st.shared.v4.f32 [r12+112], {f886, f885, f888, f887};
fma.rn.f32 f889, f733, f1871, f736;
sub.f32 f890, f1690, f1869;
fma.rn.f32 f891, f741, f552, f744;
sub.f32 f892, f1688, f743;
st.shared.v4.f32 [r12+128], {f890, f889, f892, f891};
fma.rn.f32 f893, f749, f556, f752;
sub.f32 f894, f1686, f751;
fma.rn.f32 f895, f757, f560, f760;
sub.f32 f896, f1683, f759;
st.shared.v4.f32 [r12+144], {f894, f893, f896, f895};
fma.rn.f32 f897, f765, f564, f768;
sub.f32 f898, f1681, f767;
fma.rn.f32 f899, f773, f568, f776;
sub.f32 f900, f1678, f775;
st.shared.v4.f32 [r12+160], {f898, f897, f900, f899};
fma.rn.f32 f901, f781, f572, f784;
sub.f32 f902, f1676, f783;
fma.rn.f32 f903, f789, f576, f792;
sub.f32 f904, f1674, f791;
st.shared.v4.f32 [r12+176], {f902, f901, f904, f903};
fma.rn.f32 f905, f797, f580, f800;
sub.f32 f906, f1671, f799;
fma.rn.f32 f907, f805, f584, f808;
sub.f32 f908, f1669, f807;
st.shared.v4.f32 [r12+192], {f906, f905, f908, f907};
fma.rn.f32 f909, f813, f588, f816;
sub.f32 f910, f1667, f815;
fma.rn.f32 f911, f821, f592, f824;
sub.f32 f912, f1664, f823;
st.shared.v4.f32 [r12+208], {f910, f909, f912, f911};
fma.rn.f32 f913, f829, f596, f832;
sub.f32 f914, f1662, f831;
fma.rn.f32 f915, f837, f600, f840;
sub.f32 f916, f1659, f839;
st.shared.v4.f32 [r12+224], {f914, f913, f916, f915};
fma.rn.f32 f917, f845, f604, f848;
sub.f32 f918, f1656, f847;
fma.rn.f32 f919, f853, f608, f856;
sub.f32 f920, f854, f855;
st.shared.v4.f32 [r12+240], {f918, f917, f920, f919};
barrier.sync 0;
mad.lo.s32 r13, r22, -248, r12;
ld.shared.v2.f32 {f921, f922}, [r13];
ld.shared.v2.f32 {f925, f926}, [r13+256];
ld.shared.v2.f32 {f929, f930}, [r13+512];
ld.shared.v2.f32 {f933, f934}, [r13+768];
ld.shared.v2.f32 {f937, f938}, [r13+1024];
ld.shared.v2.f32 {f941, f942}, [r13+1280];
ld.shared.v2.f32 {f945, f946}, [r13+1536];
ld.shared.v2.f32 {f949, f950}, [r13+1792];
ld.shared.v2.f32 {f953, f954}, [r13+2048];
ld.shared.v2.f32 {f957, f958}, [r13+2304];
ld.shared.v2.f32 {f961, f962}, [r13+2560];
ld.shared.v2.f32 {f965, f966}, [r13+2816];
ld.shared.v2.f32 {f969, f970}, [r13+3072];
ld.shared.v2.f32 {f973, f974}, [r13+3328];
ld.shared.v2.f32 {f977, f978}, [r13+3584];
ld.shared.v2.f32 {f981, f982}, [r13+3840];
ld.shared.v2.f32 {f985, f986}, [r13+4096];
ld.shared.v2.f32 {f989, f990}, [r13+4352];
ld.shared.v2.f32 {f993, f994}, [r13+4608];
ld.shared.v2.f32 {f997, f998}, [r13+4864];
ld.shared.v2.f32 {f1001, f1002}, [r13+5120];
ld.shared.v2.f32 {f1005, f1006}, [r13+5376];
ld.shared.v2.f32 {f1009, f1010}, [r13+5632];
ld.shared.v2.f32 {f1013, f1014}, [r13+5888];
ld.shared.v2.f32 {f1017, f1018}, [r13+6144];
ld.shared.v2.f32 {f1021, f1022}, [r13+6400];
ld.shared.v2.f32 {f1025, f1026}, [r13+6656];
ld.shared.v2.f32 {f1029, f1030}, [r13+6912];
ld.shared.v2.f32 {f1033, f1034}, [r13+7168];
ld.shared.v2.f32 {f1037, f1038}, [r13+7424];
ld.shared.v2.f32 {f1041, f1042}, [r13+7680];
ld.shared.v2.f32 {f1045, f1046}, [r13+7936];
add.f32 f1049, f921, f985;
sub.f32 f1051, f921, f985;
add.f32 f1655, f922, f986;
sub.f32 f1052, f922, f986;
add.f32 f1053, f953, f1017;
sub.f32 f1055, f953, f1017;
add.f32 f1654, f954, f1018;
sub.f32 f1056, f954, f1018;
add.f32 f1057, f1049, f1053;
sub.f32 f1059, f1049, f1053;
add.f32 f1653, f1655, f1654;
sub.f32 f1060, f1655, f1654;
add.f32 f1061, f1051, f1056;
sub.f32 f1063, f1051, f1056;
sub.f32 f1652, f1052, f1055;
add.f32 f1064, f1052, f1055;
add.f32 f1065, f937, f1001;
sub.f32 f1067, f937, f1001;
add.f32 f1651, f938, f1002;
sub.f32 f1068, f938, f1002;
add.f32 f1069, f969, f1033;
sub.f32 f1071, f969, f1033;
add.f32 f1650, f970, f1034;
sub.f32 f1072, f970, f1034;
add.f32 f1073, f1065, f1069;
sub.f32 f1075, f1065, f1069;
add.f32 f1649, f1651, f1650;
sub.f32 f1076, f1651, f1650;
add.f32 f1077, f1067, f1072;
sub.f32 f1079, f1067, f1072;
sub.f32 f1648, f1068, f1071;
add.f32 f1080, f1068, f1071;
mul.f32 f1082, f1648, 0fBF3504F3;
mul.f32 f1647, f1077, 0f3F3504F3;
sub.f32 f1083, f1647, f1082;
mul.f32 f1084, f1648, 0f3F3504F3;
fma.rn.f32 f1085, f1077, 0fBF3504F3, f1084;
mul.f32 f1086, f1079, 0fBF3504F3;
mul.f32 f1087, f1080, 0fBF3504F3;
sub.f32 f1088, f1086, f1087;
add.f32 f1089, f1086, f1087;
add.f32 f1090, f1057, f1073;
sub.f32 f1092, f1057, f1073;
add.f32 f1646, f1653, f1649;
sub.f32 f1093, f1653, f1649;
add.f32 f1094, f1061, f1083;
sub.f32 f1096, f1061, f1083;
add.f32 f1645, f1652, f1085;
sub.f32 f1097, f1652, f1085;
add.f32 f1098, f1059, f1076;
sub.f32 f1100, f1059, f1076;
sub.f32 f1644, f1060, f1075;
add.f32 f1101, f1060, f1075;
add.f32 f1102, f1063, f1088;
sub.f32 f1104, f1063, f1088;
add.f32 f1643, f1064, f1089;
sub.f32 f1105, f1064, f1089;
add.f32 f1106, f929, f993;
sub.f32 f1108, f929, f993;
add.f32 f1642, f930, f994;
sub.f32 f1109, f930, f994;
add.f32 f1110, f961, f1025;
sub.f32 f1112, f961, f1025;
add.f32 f1641, f962, f1026;
sub.f32 f1113, f962, f1026;
add.f32 f1114, f1106, f1110;
sub.f32 f1116, f1106, f1110;
add.f32 f1640, f1642, f1641;
sub.f32 f1117, f1642, f1641;
add.f32 f1118, f1108, f1113;
sub.f32 f1120, f1108, f1113;
sub.f32 f1639, f1109, f1112;
add.f32 f1121, f1109, f1112;
add.f32 f1122, f945, f1009;
sub.f32 f1124, f945, f1009;
add.f32 f1638, f946, f1010;
sub.f32 f1125, f946, f1010;
add.f32 f1126, f977, f1041;
sub.f32 f1128, f977, f1041;
add.f32 f1637, f978, f1042;
sub.f32 f1129, f978, f1042;
add.f32 f1130, f1122, f1126;
sub.f32 f1132, f1122, f1126;
add.f32 f1636, f1638, f1637;
sub.f32 f1133, f1638, f1637;
add.f32 f1134, f1124, f1129;
sub.f32 f1136, f1124, f1129;
sub.f32 f1635, f1125, f1128;
add.f32 f1137, f1125, f1128;
mul.f32 f1139, f1635, 0fBF3504F3;
mul.f32 f1634, f1134, 0f3F3504F3;
sub.f32 f1140, f1634, f1139;
mul.f32 f1141, f1635, 0f3F3504F3;
fma.rn.f32 f1142, f1134, 0fBF3504F3, f1141;
mul.f32 f1143, f1136, 0fBF3504F3;
mul.f32 f1144, f1137, 0fBF3504F3;
sub.f32 f1145, f1143, f1144;
add.f32 f1146, f1143, f1144;
add.f32 f1147, f1114, f1130;
sub.f32 f1149, f1114, f1130;
add.f32 f1633, f1640, f1636;
sub.f32 f1150, f1640, f1636;
add.f32 f1151, f1118, f1140;
sub.f32 f1153, f1118, f1140;
add.f32 f1632, f1639, f1142;
sub.f32 f1154, f1639, f1142;
add.f32 f1155, f1116, f1133;
sub.f32 f1157, f1116, f1133;
sub.f32 f1631, f1117, f1132;
add.f32 f1158, f1117, f1132;
add.f32 f1159, f1120, f1145;
sub.f32 f1161, f1120, f1145;
add.f32 f1630, f1121, f1146;
sub.f32 f1162, f1121, f1146;
mul.f32 f1628, f1151, 0f3F6C835E;
mul.f32 f1629, f1632, 0fBEC3EF15;
sub.f32 f1165, f1628, f1629;
mul.f32 f1166, f1632, 0f3F6C835E;
fma.rn.f32 f1167, f1151, 0fBEC3EF15, f1166;
mul.f32 f1626, f1155, 0f3F3504F3;
mul.f32 f1627, f1631, 0fBF3504F3;
sub.f32 f1170, f1626, f1627;
mul.f32 f1171, f1631, 0f3F3504F3;
fma.rn.f32 f1172, f1155, 0fBF3504F3, f1171;
mul.f32 f1174, f1630, 0fBF6C835E;
mul.f32 f1625, f1159, 0f3EC3EF15;
sub.f32 f1175, f1625, f1174;
mul.f32 f1176, f1630, 0f3EC3EF15;
fma.rn.f32 f1177, f1159, 0fBF6C835E, f1176;
mul.f32 f1179, f1154, 0fBF6C835E;
mul.f32 f1624, f1153, 0fBEC3EF15;
sub.f32 f1180, f1624, f1179;
mul.f32 f1181, f1154, 0fBEC3EF15;
fma.rn.f32 f1182, f1153, 0fBF6C835E, f1181;
mul.f32 f1183, f1157, 0fBF3504F3;
mul.f32 f1184, f1158, 0fBF3504F3;
sub.f32 f1185, f1183, f1184;
add.f32 f1186, f1183, f1184;
mul.f32 f1622, f1161, 0fBF6C835E;
mul.f32 f1623, f1162, 0fBEC3EF15;
sub.f32 f1189, f1622, f1623;
mul.f32 f1190, f1162, 0fBF6C835E;
fma.rn.f32 f1191, f1161, 0fBEC3EF15, f1190;
add.f32 f1192, f1090, f1147;
sub.f32 f1194, f1090, f1147;
add.f32 f1621, f1646, f1633;
sub.f32 f1195, f1646, f1633;
add.f32 f1196, f1094, f1165;
sub.f32 f1198, f1094, f1165;
add.f32 f1620, f1645, f1167;
sub.f32 f1199, f1645, f1167;
add.f32 f1200, f1098, f1170;
sub.f32 f1202, f1098, f1170;
add.f32 f1619, f1644, f1172;
sub.f32 f1203, f1644, f1172;
add.f32 f1204, f1102, f1175;
sub.f32 f1206, f1102, f1175;
add.f32 f1618, f1643, f1177;
sub.f32 f1207, f1643, f1177;
add.f32 f1208, f1092, f1150;
sub.f32 f1210, f1092, f1150;
sub.f32 f1617, f1093, f1149;
add.f32 f1211, f1093, f1149;
add.f32 f1212, f1096, f1180;
sub.f32 f1214, f1096, f1180;
add.f32 f1616, f1097, f1182;
sub.f32 f1215, f1097, f1182;
add.f32 f1216, f1100, f1185;
sub.f32 f1218, f1100, f1185;
add.f32 f1615, f1101, f1186;
sub.f32 f1219, f1101, f1186;
add.f32 f1220, f1104, f1189;
sub.f32 f1222, f1104, f1189;
add.f32 f1614, f1105, f1191;
sub.f32 f1223, f1105, f1191;
add.f32 f1224, f925, f989;
sub.f32 f1226, f925, f989;
add.f32 f1613, f926, f990;
sub.f32 f1227, f926, f990;
add.f32 f1228, f957, f1021;
sub.f32 f1230, f957, f1021;
add.f32 f1612, f958, f1022;
sub.f32 f1231, f958, f1022;
add.f32 f1232, f1224, f1228;
sub.f32 f1234, f1224, f1228;
add.f32 f1611, f1613, f1612;
sub.f32 f1235, f1613, f1612;
add.f32 f1236, f1226, f1231;
sub.f32 f1238, f1226, f1231;
sub.f32 f1610, f1227, f1230;
add.f32 f1239, f1227, f1230;
add.f32 f1240, f941, f1005;
sub.f32 f1242, f941, f1005;
add.f32 f1609, f942, f1006;
sub.f32 f1243, f942, f1006;
add.f32 f1244, f973, f1037;
sub.f32 f1246, f973, f1037;
add.f32 f1608, f974, f1038;
sub.f32 f1247, f974, f1038;
add.f32 f1248, f1240, f1244;
sub.f32 f1250, f1240, f1244;
add.f32 f1607, f1609, f1608;
sub.f32 f1251, f1609, f1608;
add.f32 f1252, f1242, f1247;
sub.f32 f1254, f1242, f1247;
sub.f32 f1606, f1243, f1246;
add.f32 f1255, f1243, f1246;
mul.f32 f1257, f1606, 0fBF3504F3;
mul.f32 f1605, f1252, 0f3F3504F3;
sub.f32 f1258, f1605, f1257;
mul.f32 f1259, f1606, 0f3F3504F3;
fma.rn.f32 f1260, f1252, 0fBF3504F3, f1259;
mul.f32 f1261, f1254, 0fBF3504F3;
mul.f32 f1262, f1255, 0fBF3504F3;
sub.f32 f1263, f1261, f1262;
add.f32 f1264, f1261, f1262;
add.f32 f1265, f1232, f1248;
sub.f32 f1267, f1232, f1248;
add.f32 f1604, f1611, f1607;
sub.f32 f1268, f1611, f1607;
add.f32 f1269, f1236, f1258;
sub.f32 f1271, f1236, f1258;
add.f32 f1603, f1610, f1260;
sub.f32 f1272, f1610, f1260;
add.f32 f1273, f1234, f1251;
sub.f32 f1275, f1234, f1251;
sub.f32 f1602, f1235, f1250;
add.f32 f1276, f1235, f1250;
add.f32 f1277, f1238, f1263;
sub.f32 f1279, f1238, f1263;
add.f32 f1601, f1239, f1264;
sub.f32 f1280, f1239, f1264;
add.f32 f1281, f933, f997;
sub.f32 f1283, f933, f997;
add.f32 f1600, f934, f998;
sub.f32 f1284, f934, f998;
add.f32 f1285, f965, f1029;
sub.f32 f1287, f965, f1029;
add.f32 f1599, f966, f1030;
sub.f32 f1288, f966, f1030;
add.f32 f1289, f1281, f1285;
sub.f32 f1291, f1281, f1285;
add.f32 f1598, f1600, f1599;
sub.f32 f1292, f1600, f1599;
add.f32 f1293, f1283, f1288;
sub.f32 f1295, f1283, f1288;
sub.f32 f1597, f1284, f1287;
add.f32 f1296, f1284, f1287;
add.f32 f1297, f949, f1013;
sub.f32 f1299, f949, f1013;
add.f32 f1596, f950, f1014;
sub.f32 f1300, f950, f1014;
add.f32 f1301, f981, f1045;
sub.f32 f1303, f981, f1045;
add.f32 f1595, f982, f1046;
sub.f32 f1304, f982, f1046;
add.f32 f1305, f1297, f1301;
sub.f32 f1307, f1297, f1301;
add.f32 f1594, f1596, f1595;
sub.f32 f1308, f1596, f1595;
add.f32 f1309, f1299, f1304;
sub.f32 f1311, f1299, f1304;
sub.f32 f1593, f1300, f1303;
add.f32 f1312, f1300, f1303;
mul.f32 f1314, f1593, 0fBF3504F3;
mul.f32 f1592, f1309, 0f3F3504F3;
sub.f32 f1315, f1592, f1314;
mul.f32 f1316, f1593, 0f3F3504F3;
fma.rn.f32 f1317, f1309, 0fBF3504F3, f1316;
mul.f32 f1318, f1311, 0fBF3504F3;
mul.f32 f1319, f1312, 0fBF3504F3;
sub.f32 f1320, f1318, f1319;
add.f32 f1321, f1318, f1319;
add.f32 f1322, f1289, f1305;
sub.f32 f1324, f1289, f1305;
add.f32 f1591, f1598, f1594;
sub.f32 f1325, f1598, f1594;
add.f32 f1326, f1293, f1315;
sub.f32 f1328, f1293, f1315;
add.f32 f1590, f1597, f1317;
sub.f32 f1329, f1597, f1317;
add.f32 f1330, f1291, f1308;
sub.f32 f1332, f1291, f1308;
sub.f32 f1589, f1292, f1307;
add.f32 f1333, f1292, f1307;
add.f32 f1334, f1295, f1320;
sub.f32 f1336, f1295, f1320;
add.f32 f1588, f1296, f1321;
sub.f32 f1337, f1296, f1321;
mul.f32 f1586, f1326, 0f3F6C835E;
mul.f32 f1587, f1590, 0fBEC3EF15;
sub.f32 f1340, f1586, f1587;
mul.f32 f1341, f1590, 0f3F6C835E;
fma.rn.f32 f1342, f1326, 0fBEC3EF15, f1341;
mul.f32 f1584, f1330, 0f3F3504F3;
mul.f32 f1585, f1589, 0fBF3504F3;
sub.f32 f1345, f1584, f1585;
mul.f32 f1346, f1589, 0f3F3504F3;
fma.rn.f32 f1347, f1330, 0fBF3504F3, f1346;
mul.f32 f1582, f1334, 0f3EC3EF15;
mul.f32 f1583, f1588, 0fBF6C835E;
sub.f32 f1350, f1582, f1583;
mul.f32 f1351, f1588, 0f3EC3EF15;
fma.rn.f32 f1352, f1334, 0fBF6C835E, f1351;
mul.f32 f1580, f1328, 0fBEC3EF15;
mul.f32 f1581, f1329, 0fBF6C835E;
sub.f32 f1355, f1580, f1581;
mul.f32 f1356, f1329, 0fBEC3EF15;
fma.rn.f32 f1357, f1328, 0fBF6C835E, f1356;
mul.f32 f1358, f1332, 0fBF3504F3;
mul.f32 f1359, f1333, 0fBF3504F3;
sub.f32 f1360, f1358, f1359;
add.f32 f1361, f1358, f1359;
mul.f32 f1578, f1336, 0fBF6C835E;
mul.f32 f1579, f1337, 0fBEC3EF15;
sub.f32 f1364, f1578, f1579;
mul.f32 f1365, f1337, 0fBF6C835E;
fma.rn.f32 f1366, f1336, 0fBEC3EF15, f1365;
add.f32 f1367, f1265, f1322;
sub.f32 f1369, f1265, f1322;
add.f32 f1577, f1604, f1591;
sub.f32 f1370, f1604, f1591;
add.f32 f1371, f1269, f1340;
sub.f32 f1373, f1269, f1340;
add.f32 f1576, f1603, f1342;
sub.f32 f1374, f1603, f1342;
add.f32 f1375, f1273, f1345;
sub.f32 f1377, f1273, f1345;
add.f32 f1575, f1602, f1347;
sub.f32 f1378, f1602, f1347;
add.f32 f1379, f1277, f1350;
sub.f32 f1381, f1277, f1350;
add.f32 f1574, f1601, f1352;
sub.f32 f1382, f1601, f1352;
add.f32 f1383, f1267, f1325;
sub.f32 f1385, f1267, f1325;
sub.f32 f1573, f1268, f1324;
add.f32 f1386, f1268, f1324;
add.f32 f1387, f1271, f1355;
sub.f32 f1389, f1271, f1355;
add.f32 f1572, f1272, f1357;
sub.f32 f1390, f1272, f1357;
add.f32 f1391, f1275, f1360;
sub.f32 f1393, f1275, f1360;
add.f32 f1571, f1276, f1361;
sub.f32 f1394, f1276, f1361;
add.f32 f1395, f1279, f1364;
sub.f32 f1397, f1279, f1364;
add.f32 f1570, f1280, f1366;
sub.f32 f1398, f1280, f1366;
mul.f32 f1400, f1576, 0fBE47C5C2;
mul.f32 f1569, f1371, 0f3F7B14BE;
sub.f32 f1401, f1569, f1400;
mul.f32 f1402, f1576, 0f3F7B14BE;
fma.rn.f32 f1403, f1371, 0fBE47C5C2, f1402;
mul.f32 f1405, f1575, 0fBEC3EF15;
mul.f32 f1568, f1375, 0f3F6C835E;
sub.f32 f1406, f1568, f1405;
mul.f32 f1407, f1575, 0f3F6C835E;
fma.rn.f32 f1408, f1375, 0fBEC3EF15, f1407;
mul.f32 f1566, f1379, 0f3F54DB31;
mul.f32 f1567, f1574, 0fBF0E39DA;
sub.f32 f1411, f1566, f1567;
mul.f32 f1412, f1574, 0f3F54DB31;
fma.rn.f32 f1413, f1379, 0fBF0E39DA, f1412;
mul.f32 f1564, f1383, 0f3F3504F3;
mul.f32 f1565, f1573, 0fBF3504F3;
sub.f32 f1416, f1564, f1565;
mul.f32 f1417, f1573, 0f3F3504F3;
fma.rn.f32 f1418, f1383, 0fBF3504F3, f1417;
mul.f32 f1562, f1387, 0f3F0E39DA;
mul.f32 f1563, f1572, 0fBF54DB31;
sub.f32 f1421, f1562, f1563;
mul.f32 f1422, f1572, 0f3F0E39DA;
fma.rn.f32 f1423, f1387, 0fBF54DB31, f1422;
mul.f32 f1560, f1391, 0f3EC3EF15;
mul.f32 f1561, f1571, 0fBF6C835E;
sub.f32 f1426, f1560, f1561;
mul.f32 f1427, f1571, 0f3EC3EF15;
fma.rn.f32 f1428, f1391, 0fBF6C835E, f1427;
mul.f32 f1430, f1570, 0fBF7B14BE;
mul.f32 f1559, f1395, 0f3E47C5C2;
sub.f32 f1431, f1559, f1430;
mul.f32 f1432, f1570, 0f3E47C5C2;
fma.rn.f32 f1433, f1395, 0fBF7B14BE, f1432;
mul.f32 f1435, f1374, 0fBF7B14BE;
mul.f32 f1558, f1373, 0fBE47C5C2;
sub.f32 f1436, f1558, f1435;
mul.f32 f1437, f1374, 0fBE47C5C2;
fma.rn.f32 f1438, f1373, 0fBF7B14BE, f1437;
mul.f32 f1440, f1378, 0fBF6C835E;
mul.f32 f1557, f1377, 0fBEC3EF15;
sub.f32 f1441, f1557, f1440;
mul.f32 f1442, f1378, 0fBEC3EF15;
fma.rn.f32 f1443, f1377, 0fBF6C835E, f1442;
mul.f32 f1445, f1382, 0fBF54DB31;
mul.f32 f1556, f1381, 0fBF0E39DA;
sub.f32 f1446, f1556, f1445;
mul.f32 f1447, f1382, 0fBF0E39DA;
fma.rn.f32 f1448, f1381, 0fBF54DB31, f1447;
mul.f32 f1449, f1385, 0fBF3504F3;
mul.f32 f1450, f1386, 0fBF3504F3;
sub.f32 f1451, f1449, f1450;
add.f32 f1452, f1449, f1450;
mul.f32 f1454, f1390, 0fBF0E39DA;
mul.f32 f1555, f1389, 0fBF54DB31;
sub.f32 f1455, f1555, f1454;
mul.f32 f1456, f1390, 0fBF54DB31;
fma.rn.f32 f1457, f1389, 0fBF0E39DA, f1456;
mul.f32 f1459, f1394, 0fBEC3EF15;
mul.f32 f1554, f1393, 0fBF6C835E;
sub.f32 f1460, f1554, f1459;
mul.f32 f1461, f1394, 0fBF6C835E;
fma.rn.f32 f1462, f1393, 0fBEC3EF15, f1461;
mul.f32 f1464, f1398, 0fBE47C5C2;
mul.f32 f1553, f1397, 0fBF7B14BE;
sub.f32 f1465, f1553, f1464;
mul.f32 f1466, f1398, 0fBF7B14BE;
fma.rn.f32 f1467, f1397, 0fBE47C5C2, f1466;
add.f32 %0, f1192, f1367;
add.f32 %1, f1621, f1577;
add.f32 %2, f1196, f1401;
add.f32 %3, f1620, f1403;
add.f32 %4, f1200, f1406;
add.f32 %5, f1619, f1408;
add.f32 %6, f1204, f1411;
add.f32 %7, f1618, f1413;
add.f32 %9, f1617, f1418;
add.f32 %8, f1208, f1416;
add.f32 %11, f1616, f1423;
add.f32 %10, f1212, f1421;
add.f32 %12, f1216, f1426;
add.f32 %13, f1615, f1428;
add.f32 %14, f1220, f1431;
add.f32 %15, f1614, f1433;
add.f32 %16, f1194, f1370;
sub.f32 %17, f1195, f1369;
add.f32 %18, f1198, f1436;
add.f32 %19, f1199, f1438;
add.f32 %21, f1203, f1443;
add.f32 %20, f1202, f1441;
add.f32 %23, f1207, f1448;
add.f32 %22, f1206, f1446;
add.f32 %25, f1211, f1452;
add.f32 %24, f1210, f1451;
add.f32 %26, f1214, f1455;
add.f32 %27, f1215, f1457;
add.f32 %28, f1218, f1460;
add.f32 %29, f1219, f1462;
add.f32 %30, f1222, f1465;
add.f32 %31, f1223, f1467;
sub.f32 %33, f1621, f1577;
sub.f32 %32, f1192, f1367;
sub.f32 %35, f1620, f1403;
sub.f32 %34, f1196, f1401;
sub.f32 %37, f1619, f1408;
sub.f32 %36, f1200, f1406;
sub.f32 %39, f1618, f1413;
sub.f32 %38, f1204, f1411;
sub.f32 %41, f1617, f1418;
sub.f32 %40, f1208, f1416;
sub.f32 %43, f1616, f1423;
sub.f32 %42, f1212, f1421;
sub.f32 %45, f1615, f1428;
sub.f32 %44, f1216, f1426;
sub.f32 %47, f1614, f1433;
sub.f32 %46, f1220, f1431;
add.f32 %49, f1195, f1369;
sub.f32 %48, f1194, f1370;
sub.f32 %51, f1199, f1438;
sub.f32 %50, f1198, f1436;
sub.f32 %53, f1203, f1443;
sub.f32 %52, f1202, f1441;
sub.f32 %55, f1207, f1448;
sub.f32 %54, f1206, f1446;
sub.f32 %57, f1211, f1452;
sub.f32 %56, f1210, f1451;
sub.f32 %59, f1215, f1457;
sub.f32 %58, f1214, f1455;
sub.f32 %61, f1219, f1462;
sub.f32 %60, f1218, f1460;
sub.f32 %63, f1223, f1467;
sub.f32 %62, f1222, f1465;
})"
     : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y), "=f"(rmem[8].x), "=f"(rmem[8].y), "=f"(rmem[9].x), "=f"(rmem[9].y), "=f"(rmem[10].x), "=f"(rmem[10].y), "=f"(rmem[11].x), "=f"(rmem[11].y), "=f"(rmem[12].x), "=f"(rmem[12].y), "=f"(rmem[13].x), "=f"(rmem[13].y), "=f"(rmem[14].x), "=f"(rmem[14].y), "=f"(rmem[15].x), "=f"(rmem[15].y), "=f"(rmem[16].x), "=f"(rmem[16].y), "=f"(rmem[17].x), "=f"(rmem[17].y), "=f"(rmem[18].x), "=f"(rmem[18].y), "=f"(rmem[19].x), "=f"(rmem[19].y), "=f"(rmem[20].x), "=f"(rmem[20].y), "=f"(rmem[21].x), "=f"(rmem[21].y), "=f"(rmem[22].x), "=f"(rmem[22].y), "=f"(rmem[23].x), "=f"(rmem[23].y), "=f"(rmem[24].x), "=f"(rmem[24].y), "=f"(rmem[25].x), "=f"(rmem[25].y), "=f"(rmem[26].x), "=f"(rmem[26].y), "=f"(rmem[27].x), "=f"(rmem[27].y), "=f"(rmem[28].x), "=f"(rmem[28].y), "=f"(rmem[29].x), "=f"(rmem[29].y), "=f"(rmem[30].x), "=f"(rmem[30].y), "=f"(rmem[31].x), "=f"(rmem[31].y): "r"(smem), "l"(lut_sp_32_1024), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y), "f"(rmem[8].x), "f"(rmem[8].y), "f"(rmem[9].x), "f"(rmem[9].y), "f"(rmem[10].x), "f"(rmem[10].y), "f"(rmem[11].x), "f"(rmem[11].y), "f"(rmem[12].x), "f"(rmem[12].y), "f"(rmem[13].x), "f"(rmem[13].y), "f"(rmem[14].x), "f"(rmem[14].y), "f"(rmem[15].x), "f"(rmem[15].y), "f"(rmem[16].x), "f"(rmem[16].y), "f"(rmem[17].x), "f"(rmem[17].y), "f"(rmem[18].x), "f"(rmem[18].y), "f"(rmem[19].x), "f"(rmem[19].y), "f"(rmem[20].x), "f"(rmem[20].y), "f"(rmem[21].x), "f"(rmem[21].y), "f"(rmem[22].x), "f"(rmem[22].y), "f"(rmem[23].x), "f"(rmem[23].y), "f"(rmem[24].x), 
"f"(rmem[24].y), "f"(rmem[25].x), "f"(rmem[25].y), "f"(rmem[26].x), "f"(rmem[26].y), "f"(rmem[27].x), "f"(rmem[27].y), "f"(rmem[28].x), "f"(rmem[28].y), "f"(rmem[29].x), "f"(rmem[29].y), "f"(rmem[30].x), "f"(rmem[30].y), "f"(rmem[31].x), "f"(rmem[31].y), "f"(rmem[16].y), "f"(rmem[8].y), "f"(rmem[4].y), "f"(rmem[20].y), "f"(rmem[28].y), "f"(rmem[2].y), "f"(rmem[26].y), "f"(rmem[10].y), "f"(rmem[22].y), "f"(rmem[14].y), "f"(rmem[17].y), "f"(rmem[1].y), "f"(rmem[25].y), "f"(rmem[5].y), "f"(rmem[29].y), "f"(rmem[13].y), "f"(rmem[19].y), "f"(rmem[11].y), "f"(rmem[7].y), "f"(rmem[23].y));
};




// cufftdx_private_function<81, float, 1>
//
// Single inline-PTX block that transforms the 16 complex<float> values held in
// rmem[0..15] in place: every rmem[k].x / rmem[k].y (k = 0..15) is read as an
// input operand and overwritten as an output operand.
//
// Parameters:
//   rmem - per-thread array of at least 16 complex<float> values (indices 0..15
//          are the only ones referenced by the operand lists).
//   smem - 32-bit shared-memory address used as exchange scratch. The PTX adds
//          (%tid.y << 12) to it, i.e. each threadIdx.y value works in its own
//          4096-byte slice, plus a further 4096-byte step for every 64 threads
//          in x ((%tid.x << 6) & -4096).
//
// Stages, in the order they appear in the PTX:
//   1. 16-point butterfly on the register inputs.
//   2. Load one complex factor from lut_sp_16_1024 (ld.global, entry index
//      %tid.x & 63), build its powers 2..15 by repeated complex multiplication
//      and scale butterfly outputs 1..15 with them.
//   3. Exchange through shared memory: real parts are stored (16 contiguous
//      floats per thread), reloaded with a 256-byte stride, then the same
//      region is reused for the imaginary parts.
//   4. Second 16-point butterfly, scaled by powers of one factor loaded from
//      lut_sp_16_64 (entry index = bits 4..5 of %tid.x).
//   5. Second shared-memory exchange (64-byte store stride, 256-byte load
//      stride), again real parts first and imaginary parts second.
//   6. Four 4-point butterflies whose results are written to the outputs.
//
// Immediate constants used by the butterflies:
//   0f3F3504F3 = 0.70710677 (sqrt(1/2)),  0f3F6C835E = 0.92387950 (cos(pi/8)),
//   0f3EC3EF15 = 0.38268343 (sin(pi/8));  the 0fBF../0fBE.. forms are their
//   negations.
//
// Synchronisation: the PTX executes `barrier.sync 0` eight times, so the
// function must be reached by every thread that participates in barrier 0,
// and never from divergent control flow.
//
// NOTE(review): given the header guard (FFT_1024_FP32_FWD) this looks like a
// 16 x 16 x 4 decomposition of a 1024-point forward FFT with 64 threads per
// transform - confirm against the generator before relying on it.
template<> __forceinline__ __device__ void cufftdx_private_function<81, float, 1>(cufftdx::detail::complex<float> *rmem, unsigned smem){

// Operand numbering used inside the PTX text below:
//   %0  .. %31 : outputs, rmem[0..15].x / .y in order
//   %32        : smem
//   %33        : lut_sp_16_1024
//   %34        : lut_sp_16_64
//   %35 .. %76 : input copies of rmem[0..15]; several .y values are listed
//                twice and the PTX addresses operands purely by position, so
//                the constraint list must not be reordered or de-duplicated.
asm volatile (R"({
.reg .f32 f<841>;
.reg .b32 r<22>;
.reg .b64 rd<9>;
mov.u32 r1, %tid.y;
shl.b32 r2, r1, 12;
mov.u32 r3, %32;
add.s32 r4, r3, r2;
mov.u32 r5, %tid.x;
add.f32 f65, %35, %56;
add.f32 f66, %36, %58;
sub.f32 f67, %35, %56;
sub.f32 f68, %36, %58;
add.f32 f69, %45, %67;
add.f32 f70, %47, %68;
sub.f32 f71, %45, %67;
sub.f32 f72, %47, %68;
add.f32 f73, f65, f69;
add.f32 f74, f66, f70;
sub.f32 f75, f65, f69;
sub.f32 f76, f66, f70;
add.f32 f77, f67, f72;
sub.f32 f78, f68, f71;
sub.f32 f79, f67, f72;
add.f32 f80, f68, f71;
add.f32 f81, %40, %61;
add.f32 f82, %42, %63;
sub.f32 f83, %40, %61;
sub.f32 f84, %42, %63;
add.f32 f85, %51, %72;
add.f32 f86, %52, %74;
sub.f32 f87, %51, %72;
sub.f32 f88, %52, %74;
add.f32 f89, f81, f85;
add.f32 f90, f82, f86;
sub.f32 f91, f81, f85;
sub.f32 f92, f82, f86;
add.f32 f93, f83, f88;
sub.f32 f94, f84, f87;
sub.f32 f95, f83, f88;
add.f32 f96, f84, f87;
mul.f32 f97, f93, 0f3F3504F3;
mul.f32 f98, f94, 0fBF3504F3;
sub.f32 f99, f97, f98;
mul.f32 f100, f94, 0f3F3504F3;
fma.rn.f32 f101, f93, 0fBF3504F3, f100;
mul.f32 f102, f95, 0fBF3504F3;
mul.f32 f103, f96, 0fBF3504F3;
sub.f32 f104, f102, f103;
add.f32 f105, f102, f103;
add.f32 f106, f73, f89;
add.f32 f107, f74, f90;
sub.f32 f108, f73, f89;
sub.f32 f109, f74, f90;
add.f32 f110, f77, f99;
add.f32 f111, f78, f101;
sub.f32 f112, f77, f99;
sub.f32 f113, f78, f101;
add.f32 f114, f75, f92;
sub.f32 f115, f76, f91;
sub.f32 f116, f75, f92;
add.f32 f117, f76, f91;
add.f32 f118, f79, f104;
add.f32 f119, f80, f105;
sub.f32 f120, f79, f104;
sub.f32 f121, f80, f105;
add.f32 f122, %37, %59;
add.f32 f123, %39, %60;
sub.f32 f124, %37, %59;
sub.f32 f125, %39, %60;
add.f32 f126, %48, %69;
add.f32 f127, %50, %71;
sub.f32 f128, %48, %69;
sub.f32 f129, %50, %71;
add.f32 f130, f122, f126;
add.f32 f131, f123, f127;
sub.f32 f132, f122, f126;
sub.f32 f133, f123, f127;
add.f32 f134, f124, f129;
sub.f32 f135, f125, f128;
sub.f32 f136, f124, f129;
add.f32 f137, f125, f128;
add.f32 f138, %43, %64;
add.f32 f139, %44, %66;
sub.f32 f140, %43, %64;
sub.f32 f141, %44, %66;
add.f32 f142, %53, %75;
add.f32 f143, %55, %76;
sub.f32 f144, %53, %75;
sub.f32 f145, %55, %76;
add.f32 f146, f138, f142;
add.f32 f147, f139, f143;
sub.f32 f148, f138, f142;
sub.f32 f149, f139, f143;
add.f32 f150, f140, f145;
sub.f32 f151, f141, f144;
sub.f32 f152, f140, f145;
add.f32 f153, f141, f144;
mul.f32 f154, f150, 0f3F3504F3;
mul.f32 f155, f151, 0fBF3504F3;
sub.f32 f156, f154, f155;
mul.f32 f157, f151, 0f3F3504F3;
fma.rn.f32 f158, f150, 0fBF3504F3, f157;
mul.f32 f159, f152, 0fBF3504F3;
mul.f32 f160, f153, 0fBF3504F3;
sub.f32 f161, f159, f160;
add.f32 f162, f159, f160;
add.f32 f163, f130, f146;
add.f32 f164, f131, f147;
sub.f32 f165, f130, f146;
sub.f32 f166, f131, f147;
add.f32 f167, f134, f156;
add.f32 f168, f135, f158;
sub.f32 f169, f134, f156;
sub.f32 f170, f135, f158;
add.f32 f171, f132, f149;
sub.f32 f172, f133, f148;
sub.f32 f173, f132, f149;
add.f32 f174, f133, f148;
add.f32 f175, f136, f161;
add.f32 f176, f137, f162;
sub.f32 f177, f136, f161;
sub.f32 f178, f137, f162;
mul.f32 f179, f167, 0f3F6C835E;
mul.f32 f180, f168, 0fBEC3EF15;
sub.f32 f181, f179, f180;
mul.f32 f182, f168, 0f3F6C835E;
fma.rn.f32 f183, f167, 0fBEC3EF15, f182;
mul.f32 f184, f171, 0f3F3504F3;
mul.f32 f185, f172, 0fBF3504F3;
sub.f32 f186, f184, f185;
mul.f32 f187, f172, 0f3F3504F3;
fma.rn.f32 f188, f171, 0fBF3504F3, f187;
mul.f32 f189, f175, 0f3EC3EF15;
mul.f32 f190, f176, 0fBF6C835E;
sub.f32 f191, f189, f190;
mul.f32 f192, f176, 0f3EC3EF15;
fma.rn.f32 f193, f175, 0fBF6C835E, f192;
mul.f32 f194, f169, 0fBEC3EF15;
mul.f32 f195, f170, 0fBF6C835E;
sub.f32 f196, f194, f195;
mul.f32 f197, f170, 0fBEC3EF15;
fma.rn.f32 f198, f169, 0fBF6C835E, f197;
mul.f32 f199, f173, 0fBF3504F3;
mul.f32 f200, f174, 0fBF3504F3;
sub.f32 f201, f199, f200;
add.f32 f202, f199, f200;
mul.f32 f203, f177, 0fBF6C835E;
mul.f32 f204, f178, 0fBEC3EF15;
sub.f32 f205, f203, f204;
mul.f32 f206, f178, 0fBF6C835E;
fma.rn.f32 f207, f177, 0fBEC3EF15, f206;
add.f32 f208, f106, f163;
add.f32 f209, f107, f164;
sub.f32 f210, f106, f163;
sub.f32 f211, f107, f164;
add.f32 f212, f110, f181;
add.f32 f213, f111, f183;
sub.f32 f214, f110, f181;
sub.f32 f215, f111, f183;
add.f32 f216, f114, f186;
add.f32 f217, f115, f188;
sub.f32 f218, f114, f186;
sub.f32 f219, f115, f188;
add.f32 f220, f118, f191;
add.f32 f221, f119, f193;
sub.f32 f222, f118, f191;
sub.f32 f223, f119, f193;
add.f32 f224, f108, f166;
sub.f32 f225, f109, f165;
sub.f32 f226, f108, f166;
add.f32 f227, f109, f165;
add.f32 f228, f112, f196;
add.f32 f229, f113, f198;
sub.f32 f230, f112, f196;
sub.f32 f231, f113, f198;
add.f32 f232, f116, f201;
add.f32 f233, f117, f202;
sub.f32 f234, f116, f201;
sub.f32 f235, f117, f202;
add.f32 f236, f120, f205;
add.f32 f237, f121, f207;
sub.f32 f238, f120, f205;
sub.f32 f239, f121, f207;
and.b32 r6, r5, 63;
shl.b32 r7, r5, 3;
cvt.u64.u32 rd2, r7;
and.b64 rd3, rd2, 504;
mov.u64 rd4, %33;
add.s64 rd5, rd4, rd3;
ld.global.v2.f32 {f240, f241}, [rd5];
mul.f32 f244, f240, f212;
mul.f32 f245, f241, f213;
sub.f32 f246, f244, f245;
mul.f32 f247, f240, f213;
fma.rn.f32 f248, f241, f212, f247;
mul.f32 f249, f240, f240;
mul.f32 f250, f241, f241;
sub.f32 f251, f249, f250;
mul.f32 f252, f241, f240;
fma.rn.f32 f253, f241, f240, f252;
mul.f32 f254, f251, f216;
mul.f32 f255, f253, f217;
sub.f32 f256, f254, f255;
mul.f32 f257, f251, f217;
fma.rn.f32 f258, f253, f216, f257;
mul.f32 f259, f240, f251;
mul.f32 f260, f241, f253;
sub.f32 f261, f259, f260;
mul.f32 f262, f240, f253;
fma.rn.f32 f263, f241, f251, f262;
mul.f32 f264, f261, f220;
mul.f32 f265, f263, f221;
sub.f32 f266, f264, f265;
mul.f32 f267, f261, f221;
fma.rn.f32 f268, f263, f220, f267;
mul.f32 f269, f240, f261;
mul.f32 f270, f241, f263;
sub.f32 f271, f269, f270;
mul.f32 f272, f240, f263;
fma.rn.f32 f273, f241, f261, f272;
mul.f32 f274, f271, f224;
mul.f32 f275, f273, f225;
sub.f32 f276, f274, f275;
mul.f32 f277, f271, f225;
fma.rn.f32 f278, f273, f224, f277;
mul.f32 f279, f240, f271;
mul.f32 f280, f241, f273;
sub.f32 f281, f279, f280;
mul.f32 f282, f240, f273;
fma.rn.f32 f283, f241, f271, f282;
mul.f32 f284, f281, f228;
mul.f32 f285, f283, f229;
sub.f32 f286, f284, f285;
mul.f32 f287, f281, f229;
fma.rn.f32 f288, f283, f228, f287;
mul.f32 f289, f240, f281;
mul.f32 f290, f241, f283;
sub.f32 f291, f289, f290;
mul.f32 f292, f240, f283;
fma.rn.f32 f293, f241, f281, f292;
mul.f32 f294, f291, f232;
mul.f32 f295, f293, f233;
sub.f32 f296, f294, f295;
mul.f32 f297, f291, f233;
fma.rn.f32 f298, f293, f232, f297;
mul.f32 f299, f240, f291;
mul.f32 f300, f241, f293;
sub.f32 f301, f299, f300;
mul.f32 f302, f240, f293;
fma.rn.f32 f303, f241, f291, f302;
mul.f32 f304, f301, f236;
mul.f32 f305, f303, f237;
sub.f32 f306, f304, f305;
mul.f32 f307, f301, f237;
fma.rn.f32 f308, f303, f236, f307;
mul.f32 f309, f240, f301;
mul.f32 f310, f241, f303;
sub.f32 f311, f309, f310;
mul.f32 f312, f240, f303;
fma.rn.f32 f313, f241, f301, f312;
mul.f32 f314, f311, f210;
mul.f32 f315, f313, f211;
sub.f32 f316, f314, f315;
mul.f32 f317, f311, f211;
fma.rn.f32 f318, f313, f210, f317;
mul.f32 f319, f240, f311;
mul.f32 f320, f241, f313;
sub.f32 f321, f319, f320;
mul.f32 f322, f240, f313;
fma.rn.f32 f323, f241, f311, f322;
mul.f32 f324, f321, f214;
mul.f32 f325, f323, f215;
sub.f32 f326, f324, f325;
mul.f32 f327, f321, f215;
fma.rn.f32 f328, f323, f214, f327;
mul.f32 f329, f240, f321;
mul.f32 f330, f241, f323;
sub.f32 f331, f329, f330;
mul.f32 f332, f240, f323;
fma.rn.f32 f333, f241, f321, f332;
mul.f32 f334, f331, f218;
mul.f32 f335, f333, f219;
sub.f32 f336, f334, f335;
mul.f32 f337, f331, f219;
fma.rn.f32 f338, f333, f218, f337;
mul.f32 f339, f240, f331;
mul.f32 f340, f241, f333;
sub.f32 f341, f339, f340;
mul.f32 f342, f240, f333;
fma.rn.f32 f343, f241, f331, f342;
mul.f32 f344, f341, f222;
mul.f32 f345, f343, f223;
sub.f32 f346, f344, f345;
mul.f32 f347, f341, f223;
fma.rn.f32 f348, f343, f222, f347;
mul.f32 f349, f240, f341;
mul.f32 f350, f241, f343;
sub.f32 f351, f349, f350;
mul.f32 f352, f240, f343;
fma.rn.f32 f353, f241, f341, f352;
mul.f32 f354, f351, f226;
mul.f32 f355, f353, f227;
sub.f32 f356, f354, f355;
mul.f32 f357, f351, f227;
fma.rn.f32 f358, f353, f226, f357;
mul.f32 f359, f240, f351;
mul.f32 f360, f241, f353;
sub.f32 f361, f359, f360;
mul.f32 f362, f240, f353;
fma.rn.f32 f363, f241, f351, f362;
mul.f32 f364, f361, f230;
mul.f32 f365, f363, f231;
sub.f32 f366, f364, f365;
mul.f32 f367, f361, f231;
fma.rn.f32 f368, f363, f230, f367;
mul.f32 f369, f240, f361;
mul.f32 f370, f241, f363;
sub.f32 f371, f369, f370;
mul.f32 f372, f240, f363;
fma.rn.f32 f373, f241, f361, f372;
mul.f32 f374, f371, f234;
mul.f32 f375, f373, f235;
sub.f32 f376, f374, f375;
mul.f32 f377, f371, f235;
fma.rn.f32 f378, f373, f234, f377;
mul.f32 f379, f240, f371;
mul.f32 f380, f241, f373;
sub.f32 f381, f379, f380;
mul.f32 f382, f240, f373;
fma.rn.f32 f383, f241, f371, f382;
mul.f32 f384, f381, f238;
mul.f32 f385, f383, f239;
sub.f32 f386, f384, f385;
mul.f32 f387, f381, f239;
fma.rn.f32 f388, f383, f238, f387;
shl.b32 r8, r5, 6;
and.b32 r9, r8, -4096;
add.s32 r10, r4, r9;
barrier.sync 0;
and.b32 r11, r8, 4032;
add.s32 r12, r10, r11;
st.shared.v4.f32 [r12], {f208, f246, f256, f266};
st.shared.v4.f32 [r12+16], {f276, f286, f296, f306};
st.shared.v4.f32 [r12+32], {f316, f326, f336, f346};
st.shared.v4.f32 [r12+48], {f356, f366, f376, f386};
barrier.sync 0;
mad.lo.s32 r13, r6, -60, r12;
ld.shared.f32 f389, [r13];
ld.shared.f32 f390, [r13+256];
ld.shared.f32 f391, [r13+512];
ld.shared.f32 f392, [r13+768];
ld.shared.f32 f393, [r13+1024];
ld.shared.f32 f394, [r13+1280];
ld.shared.f32 f395, [r13+1536];
ld.shared.f32 f396, [r13+1792];
ld.shared.f32 f397, [r13+2048];
ld.shared.f32 f398, [r13+2304];
ld.shared.f32 f399, [r13+2560];
ld.shared.f32 f400, [r13+2816];
ld.shared.f32 f401, [r13+3072];
ld.shared.f32 f402, [r13+3328];
ld.shared.f32 f403, [r13+3584];
ld.shared.f32 f404, [r13+3840];
barrier.sync 0;
st.shared.v4.f32 [r12], {f209, f248, f258, f268};
st.shared.v4.f32 [r12+16], {f278, f288, f298, f308};
st.shared.v4.f32 [r12+32], {f318, f328, f338, f348};
st.shared.v4.f32 [r12+48], {f358, f368, f378, f388};
barrier.sync 0;
ld.shared.f32 f405, [r13];
ld.shared.f32 f406, [r13+256];
ld.shared.f32 f407, [r13+512];
ld.shared.f32 f408, [r13+768];
ld.shared.f32 f409, [r13+1024];
ld.shared.f32 f410, [r13+1280];
ld.shared.f32 f411, [r13+1536];
ld.shared.f32 f412, [r13+1792];
ld.shared.f32 f413, [r13+2048];
ld.shared.f32 f414, [r13+2304];
ld.shared.f32 f415, [r13+2560];
ld.shared.f32 f416, [r13+2816];
ld.shared.f32 f417, [r13+3072];
ld.shared.f32 f418, [r13+3328];
ld.shared.f32 f419, [r13+3584];
ld.shared.f32 f420, [r13+3840];
add.f32 f421, f389, f397;
add.f32 f422, f405, f413;
sub.f32 f423, f389, f397;
sub.f32 f424, f405, f413;
add.f32 f425, f393, f401;
add.f32 f426, f409, f417;
sub.f32 f427, f393, f401;
sub.f32 f428, f409, f417;
add.f32 f429, f421, f425;
add.f32 f430, f422, f426;
sub.f32 f431, f421, f425;
sub.f32 f432, f422, f426;
add.f32 f433, f423, f428;
sub.f32 f434, f424, f427;
sub.f32 f435, f423, f428;
add.f32 f436, f424, f427;
add.f32 f437, f391, f399;
add.f32 f438, f407, f415;
sub.f32 f439, f391, f399;
sub.f32 f440, f407, f415;
add.f32 f441, f395, f403;
add.f32 f442, f411, f419;
sub.f32 f443, f395, f403;
sub.f32 f444, f411, f419;
add.f32 f445, f437, f441;
add.f32 f446, f438, f442;
sub.f32 f447, f437, f441;
sub.f32 f448, f438, f442;
add.f32 f449, f439, f444;
sub.f32 f450, f440, f443;
sub.f32 f451, f439, f444;
add.f32 f452, f440, f443;
mul.f32 f453, f449, 0f3F3504F3;
mul.f32 f454, f450, 0fBF3504F3;
sub.f32 f455, f453, f454;
mul.f32 f456, f450, 0f3F3504F3;
fma.rn.f32 f457, f449, 0fBF3504F3, f456;
mul.f32 f458, f451, 0fBF3504F3;
mul.f32 f459, f452, 0fBF3504F3;
sub.f32 f460, f458, f459;
add.f32 f461, f458, f459;
add.f32 f462, f429, f445;
add.f32 f463, f430, f446;
sub.f32 f464, f429, f445;
sub.f32 f465, f430, f446;
add.f32 f466, f433, f455;
add.f32 f467, f434, f457;
sub.f32 f468, f433, f455;
sub.f32 f469, f434, f457;
add.f32 f470, f431, f448;
sub.f32 f471, f432, f447;
sub.f32 f472, f431, f448;
add.f32 f473, f432, f447;
add.f32 f474, f435, f460;
add.f32 f475, f436, f461;
sub.f32 f476, f435, f460;
sub.f32 f477, f436, f461;
add.f32 f478, f390, f398;
add.f32 f479, f406, f414;
sub.f32 f480, f390, f398;
sub.f32 f481, f406, f414;
add.f32 f482, f394, f402;
add.f32 f483, f410, f418;
sub.f32 f484, f394, f402;
sub.f32 f485, f410, f418;
add.f32 f486, f478, f482;
add.f32 f487, f479, f483;
sub.f32 f488, f478, f482;
sub.f32 f489, f479, f483;
add.f32 f490, f480, f485;
sub.f32 f491, f481, f484;
sub.f32 f492, f480, f485;
add.f32 f493, f481, f484;
add.f32 f494, f392, f400;
add.f32 f495, f408, f416;
sub.f32 f496, f392, f400;
sub.f32 f497, f408, f416;
add.f32 f498, f396, f404;
add.f32 f499, f412, f420;
sub.f32 f500, f396, f404;
sub.f32 f501, f412, f420;
add.f32 f502, f494, f498;
add.f32 f503, f495, f499;
sub.f32 f504, f494, f498;
sub.f32 f505, f495, f499;
add.f32 f506, f496, f501;
sub.f32 f507, f497, f500;
sub.f32 f508, f496, f501;
add.f32 f509, f497, f500;
mul.f32 f510, f506, 0f3F3504F3;
mul.f32 f511, f507, 0fBF3504F3;
sub.f32 f512, f510, f511;
mul.f32 f513, f507, 0f3F3504F3;
fma.rn.f32 f514, f506, 0fBF3504F3, f513;
mul.f32 f515, f508, 0fBF3504F3;
mul.f32 f516, f509, 0fBF3504F3;
sub.f32 f517, f515, f516;
add.f32 f518, f515, f516;
add.f32 f519, f486, f502;
add.f32 f520, f487, f503;
sub.f32 f521, f486, f502;
sub.f32 f522, f487, f503;
add.f32 f523, f490, f512;
add.f32 f524, f491, f514;
sub.f32 f525, f490, f512;
sub.f32 f526, f491, f514;
add.f32 f527, f488, f505;
sub.f32 f528, f489, f504;
sub.f32 f529, f488, f505;
add.f32 f530, f489, f504;
add.f32 f531, f492, f517;
add.f32 f532, f493, f518;
sub.f32 f533, f492, f517;
sub.f32 f534, f493, f518;
mul.f32 f535, f523, 0f3F6C835E;
mul.f32 f536, f524, 0fBEC3EF15;
sub.f32 f537, f535, f536;
mul.f32 f538, f524, 0f3F6C835E;
fma.rn.f32 f539, f523, 0fBEC3EF15, f538;
mul.f32 f540, f527, 0f3F3504F3;
mul.f32 f541, f528, 0fBF3504F3;
sub.f32 f542, f540, f541;
mul.f32 f543, f528, 0f3F3504F3;
fma.rn.f32 f544, f527, 0fBF3504F3, f543;
mul.f32 f545, f531, 0f3EC3EF15;
mul.f32 f546, f532, 0fBF6C835E;
sub.f32 f547, f545, f546;
mul.f32 f548, f532, 0f3EC3EF15;
fma.rn.f32 f549, f531, 0fBF6C835E, f548;
mul.f32 f550, f525, 0fBEC3EF15;
mul.f32 f551, f526, 0fBF6C835E;
sub.f32 f552, f550, f551;
mul.f32 f553, f526, 0fBEC3EF15;
fma.rn.f32 f554, f525, 0fBF6C835E, f553;
mul.f32 f555, f529, 0fBF3504F3;
mul.f32 f556, f530, 0fBF3504F3;
sub.f32 f557, f555, f556;
add.f32 f558, f555, f556;
mul.f32 f559, f533, 0fBF6C835E;
mul.f32 f560, f534, 0fBEC3EF15;
sub.f32 f561, f559, f560;
mul.f32 f562, f534, 0fBF6C835E;
fma.rn.f32 f563, f533, 0fBEC3EF15, f562;
add.f32 f564, f462, f519;
add.f32 f565, f463, f520;
sub.f32 f566, f462, f519;
sub.f32 f567, f463, f520;
add.f32 f568, f466, f537;
add.f32 f569, f467, f539;
sub.f32 f570, f466, f537;
sub.f32 f571, f467, f539;
add.f32 f572, f470, f542;
add.f32 f573, f471, f544;
sub.f32 f574, f470, f542;
sub.f32 f575, f471, f544;
add.f32 f576, f474, f547;
add.f32 f577, f475, f549;
sub.f32 f578, f474, f547;
sub.f32 f579, f475, f549;
add.f32 f580, f464, f522;
sub.f32 f581, f465, f521;
sub.f32 f582, f464, f522;
add.f32 f583, f465, f521;
add.f32 f584, f468, f552;
add.f32 f585, f469, f554;
sub.f32 f586, f468, f552;
sub.f32 f587, f469, f554;
add.f32 f588, f472, f557;
add.f32 f589, f473, f558;
sub.f32 f590, f472, f557;
sub.f32 f591, f473, f558;
add.f32 f592, f476, f561;
add.f32 f593, f477, f563;
sub.f32 f594, f476, f561;
sub.f32 f595, f477, f563;
and.b32 r14, r5, 48;
bfe.u32 r15, r5, 4, 2;
mul.wide.u32 rd6, r15, 8;
mov.u64 rd7, %34;
add.s64 rd8, rd7, rd6;
ld.global.v2.f32 {f596, f597}, [rd8];
mul.f32 f600, f596, f568;
mul.f32 f601, f597, f569;
sub.f32 f602, f600, f601;
mul.f32 f603, f596, f569;
fma.rn.f32 f604, f597, f568, f603;
mul.f32 f605, f596, f596;
mul.f32 f606, f597, f597;
sub.f32 f607, f605, f606;
mul.f32 f608, f597, f596;
fma.rn.f32 f609, f597, f596, f608;
mul.f32 f610, f607, f572;
mul.f32 f611, f609, f573;
sub.f32 f612, f610, f611;
mul.f32 f613, f607, f573;
fma.rn.f32 f614, f609, f572, f613;
mul.f32 f615, f596, f607;
mul.f32 f616, f597, f609;
sub.f32 f617, f615, f616;
mul.f32 f618, f596, f609;
fma.rn.f32 f619, f597, f607, f618;
mul.f32 f620, f617, f576;
mul.f32 f621, f619, f577;
sub.f32 f622, f620, f621;
mul.f32 f623, f617, f577;
fma.rn.f32 f624, f619, f576, f623;
mul.f32 f625, f596, f617;
mul.f32 f626, f597, f619;
sub.f32 f627, f625, f626;
mul.f32 f628, f596, f619;
fma.rn.f32 f629, f597, f617, f628;
mul.f32 f630, f627, f580;
mul.f32 f631, f629, f581;
sub.f32 f632, f630, f631;
mul.f32 f633, f627, f581;
fma.rn.f32 f634, f629, f580, f633;
mul.f32 f635, f596, f627;
mul.f32 f636, f597, f629;
sub.f32 f637, f635, f636;
mul.f32 f638, f596, f629;
fma.rn.f32 f639, f597, f627, f638;
mul.f32 f640, f637, f584;
mul.f32 f641, f639, f585;
sub.f32 f642, f640, f641;
mul.f32 f643, f637, f585;
fma.rn.f32 f644, f639, f584, f643;
mul.f32 f645, f596, f637;
mul.f32 f646, f597, f639;
sub.f32 f647, f645, f646;
mul.f32 f648, f596, f639;
fma.rn.f32 f649, f597, f637, f648;
mul.f32 f650, f647, f588;
mul.f32 f651, f649, f589;
sub.f32 f652, f650, f651;
mul.f32 f653, f647, f589;
fma.rn.f32 f654, f649, f588, f653;
mul.f32 f655, f596, f647;
mul.f32 f656, f597, f649;
sub.f32 f657, f655, f656;
mul.f32 f658, f596, f649;
fma.rn.f32 f659, f597, f647, f658;
mul.f32 f660, f657, f592;
mul.f32 f661, f659, f593;
sub.f32 f662, f660, f661;
mul.f32 f663, f657, f593;
fma.rn.f32 f664, f659, f592, f663;
mul.f32 f665, f596, f657;
mul.f32 f666, f597, f659;
sub.f32 f667, f665, f666;
mul.f32 f668, f596, f659;
fma.rn.f32 f669, f597, f657, f668;
mul.f32 f670, f667, f566;
mul.f32 f671, f669, f567;
sub.f32 f672, f670, f671;
mul.f32 f673, f667, f567;
fma.rn.f32 f674, f669, f566, f673;
mul.f32 f675, f596, f667;
mul.f32 f676, f597, f669;
sub.f32 f677, f675, f676;
mul.f32 f678, f596, f669;
fma.rn.f32 f679, f597, f667, f678;
mul.f32 f680, f677, f570;
mul.f32 f681, f679, f571;
sub.f32 f682, f680, f681;
mul.f32 f683, f677, f571;
fma.rn.f32 f684, f679, f570, f683;
mul.f32 f685, f596, f677;
mul.f32 f686, f597, f679;
sub.f32 f687, f685, f686;
mul.f32 f688, f596, f679;
fma.rn.f32 f689, f597, f677, f688;
mul.f32 f690, f687, f574;
mul.f32 f691, f689, f575;
sub.f32 f692, f690, f691;
mul.f32 f693, f687, f575;
fma.rn.f32 f694, f689, f574, f693;
mul.f32 f695, f596, f687;
mul.f32 f696, f597, f689;
sub.f32 f697, f695, f696;
mul.f32 f698, f596, f689;
fma.rn.f32 f699, f597, f687, f698;
mul.f32 f700, f697, f578;
mul.f32 f701, f699, f579;
sub.f32 f702, f700, f701;
mul.f32 f703, f697, f579;
fma.rn.f32 f704, f699, f578, f703;
mul.f32 f705, f596, f697;
mul.f32 f706, f597, f699;
sub.f32 f707, f705, f706;
mul.f32 f708, f596, f699;
fma.rn.f32 f709, f597, f697, f708;
mul.f32 f710, f707, f582;
mul.f32 f711, f709, f583;
sub.f32 f712, f710, f711;
mul.f32 f713, f707, f583;
fma.rn.f32 f714, f709, f582, f713;
mul.f32 f715, f596, f707;
mul.f32 f716, f597, f709;
sub.f32 f717, f715, f716;
mul.f32 f718, f596, f709;
fma.rn.f32 f719, f597, f707, f718;
mul.f32 f720, f717, f586;
mul.f32 f721, f719, f587;
sub.f32 f722, f720, f721;
mul.f32 f723, f717, f587;
fma.rn.f32 f724, f719, f586, f723;
mul.f32 f725, f596, f717;
mul.f32 f726, f597, f719;
sub.f32 f727, f725, f726;
mul.f32 f728, f596, f719;
fma.rn.f32 f729, f597, f717, f728;
mul.f32 f730, f727, f590;
mul.f32 f731, f729, f591;
sub.f32 f732, f730, f731;
mul.f32 f733, f727, f591;
fma.rn.f32 f734, f729, f590, f733;
mul.f32 f735, f596, f727;
mul.f32 f736, f597, f729;
sub.f32 f737, f735, f736;
mul.f32 f738, f596, f729;
fma.rn.f32 f739, f597, f727, f738;
mul.f32 f740, f737, f594;
mul.f32 f741, f739, f595;
sub.f32 f742, f740, f741;
mul.f32 f743, f737, f595;
fma.rn.f32 f744, f739, f594, f743;
shl.b32 r16, r5, 2;
and.b32 r17, r16, 60;
add.s32 r18, r10, r17;
barrier.sync 0;
and.b32 r19, r8, 3072;
add.s32 r20, r18, r19;
st.shared.f32 [r20], f564;
st.shared.f32 [r20+64], f602;
st.shared.f32 [r20+128], f612;
st.shared.f32 [r20+192], f622;
st.shared.f32 [r20+256], f632;
st.shared.f32 [r20+320], f642;
st.shared.f32 [r20+384], f652;
st.shared.f32 [r20+448], f662;
st.shared.f32 [r20+512], f672;
st.shared.f32 [r20+576], f682;
st.shared.f32 [r20+640], f692;
st.shared.f32 [r20+704], f702;
st.shared.f32 [r20+768], f712;
st.shared.f32 [r20+832], f722;
st.shared.f32 [r20+896], f732;
st.shared.f32 [r20+960], f742;
barrier.sync 0;
mad.lo.s32 r21, r14, -60, r20;
ld.shared.f32 f745, [r21];
ld.shared.f32 f746, [r21+256];
ld.shared.f32 f747, [r21+512];
ld.shared.f32 f748, [r21+768];
ld.shared.f32 f749, [r21+1024];
ld.shared.f32 f750, [r21+1280];
ld.shared.f32 f751, [r21+1536];
ld.shared.f32 f752, [r21+1792];
ld.shared.f32 f753, [r21+2048];
ld.shared.f32 f754, [r21+2304];
ld.shared.f32 f755, [r21+2560];
ld.shared.f32 f756, [r21+2816];
ld.shared.f32 f757, [r21+3072];
ld.shared.f32 f758, [r21+3328];
ld.shared.f32 f759, [r21+3584];
ld.shared.f32 f760, [r21+3840];
barrier.sync 0;
st.shared.f32 [r20], f565;
st.shared.f32 [r20+64], f604;
st.shared.f32 [r20+128], f614;
st.shared.f32 [r20+192], f624;
st.shared.f32 [r20+256], f634;
st.shared.f32 [r20+320], f644;
st.shared.f32 [r20+384], f654;
st.shared.f32 [r20+448], f664;
st.shared.f32 [r20+512], f674;
st.shared.f32 [r20+576], f684;
st.shared.f32 [r20+640], f694;
st.shared.f32 [r20+704], f704;
st.shared.f32 [r20+768], f714;
st.shared.f32 [r20+832], f724;
st.shared.f32 [r20+896], f734;
st.shared.f32 [r20+960], f744;
barrier.sync 0;
ld.shared.f32 f761, [r21];
ld.shared.f32 f762, [r21+256];
ld.shared.f32 f763, [r21+512];
ld.shared.f32 f764, [r21+768];
ld.shared.f32 f765, [r21+1024];
ld.shared.f32 f766, [r21+1280];
ld.shared.f32 f767, [r21+1536];
ld.shared.f32 f768, [r21+1792];
ld.shared.f32 f769, [r21+2048];
ld.shared.f32 f770, [r21+2304];
ld.shared.f32 f771, [r21+2560];
ld.shared.f32 f772, [r21+2816];
ld.shared.f32 f773, [r21+3072];
ld.shared.f32 f774, [r21+3328];
ld.shared.f32 f775, [r21+3584];
ld.shared.f32 f776, [r21+3840];
add.f32 f777, f745, f753;
add.f32 f778, f761, f769;
sub.f32 f779, f745, f753;
sub.f32 f780, f761, f769;
add.f32 f781, f749, f757;
add.f32 f782, f765, f773;
sub.f32 f783, f749, f757;
sub.f32 f784, f765, f773;
add.f32 f785, f746, f754;
add.f32 f786, f762, f770;
sub.f32 f787, f746, f754;
sub.f32 f788, f762, f770;
add.f32 f789, f750, f758;
add.f32 f790, f766, f774;
sub.f32 f791, f750, f758;
sub.f32 f792, f766, f774;
add.f32 f793, f747, f755;
add.f32 f794, f763, f771;
sub.f32 f795, f747, f755;
sub.f32 f796, f763, f771;
add.f32 f797, f751, f759;
add.f32 f798, f767, f775;
sub.f32 f799, f751, f759;
sub.f32 f800, f767, f775;
add.f32 f801, f748, f756;
add.f32 f802, f764, f772;
sub.f32 f803, f748, f756;
sub.f32 f804, f764, f772;
add.f32 f805, f752, f760;
add.f32 f806, f768, f776;
sub.f32 f807, f752, f760;
sub.f32 f808, f768, f776;
add.f32 %0, f777, f781;
add.f32 %1, f778, f782;
add.f32 %2, f785, f789;
add.f32 %3, f786, f790;
add.f32 %4, f793, f797;
add.f32 %5, f794, f798;
add.f32 %6, f801, f805;
add.f32 %7, f802, f806;
sub.f32 %9, f780, f783;
add.f32 %8, f779, f784;
sub.f32 %11, f788, f791;
add.f32 %10, f787, f792;
sub.f32 %13, f796, f799;
add.f32 %12, f795, f800;
sub.f32 %15, f804, f807;
add.f32 %14, f803, f808;
sub.f32 %16, f777, f781;
sub.f32 %17, f778, f782;
sub.f32 %18, f785, f789;
sub.f32 %19, f786, f790;
sub.f32 %20, f793, f797;
sub.f32 %21, f794, f798;
sub.f32 %22, f801, f805;
sub.f32 %23, f802, f806;
add.f32 %25, f780, f783;
sub.f32 %24, f779, f784;
add.f32 %27, f788, f791;
sub.f32 %26, f787, f792;
add.f32 %29, f796, f799;
sub.f32 %28, f795, f800;
add.f32 %31, f804, f807;
sub.f32 %30, f803, f808;
})"
     : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y), "=f"(rmem[8].x), "=f"(rmem[8].y), "=f"(rmem[9].x), "=f"(rmem[9].y), "=f"(rmem[10].x), "=f"(rmem[10].y), "=f"(rmem[11].x), "=f"(rmem[11].y), "=f"(rmem[12].x), "=f"(rmem[12].y), "=f"(rmem[13].x), "=f"(rmem[13].y), "=f"(rmem[14].x), "=f"(rmem[14].y), "=f"(rmem[15].x), "=f"(rmem[15].y): "r"(smem), "l"(lut_sp_16_1024), "l"(lut_sp_16_64), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y), "f"(rmem[7].y), "f"(rmem[8].x), "f"(rmem[8].y), "f"(rmem[8].y), "f"(rmem[9].x), "f"(rmem[9].y), "f"(rmem[10].x), "f"(rmem[10].y), "f"(rmem[10].y), "f"(rmem[11].x), "f"(rmem[11].y), "f"(rmem[11].y), "f"(rmem[12].x), "f"(rmem[12].y), "f"(rmem[13].x), "f"(rmem[13].y), "f"(rmem[13].y), "f"(rmem[14].x), "f"(rmem[14].y), "f"(rmem[14].y), "f"(rmem[15].x), "f"(rmem[15].y));
};




// Explicit specialization cufftdx_private_function<84, float, 1>: a single
// inline-PTX pass over eight complex<float> values held in rmem[0..7], which
// are read as inputs and then overwritten with the results.
// The enclosing header guard (CUFFTDX_FFT_1024_FP32_FWD_PTX_HPP) suggests this
// is one variant of a 1024-point forward single-precision FFT.
// NOTE(review): the meaning of the template arguments <84, float, 1> is not
// visible in this file section; confirm against the primary template.
//
// Parameters:
//   rmem - per-thread array; rmem[0..7].x/.y are passed as "f" inputs and all
//          sixteen floats are written back as "=f" outputs.
//   smem - 32-bit value used directly as the base address for ld/st.shared
//          (presumably a shared-window address; verify against the caller).
//
// Structure of the PTX body, as written:
//   1. An add/sub butterfly network over the eight register inputs, pairing
//      elements (0,4), (2,6), (1,5), (3,7). The constants 0f3F3504F3 and
//      0fBF3504F3 are +/-0.70710677f (sqrt(2)/2).
//   2. One float pair w = (w.x, w.y) is loaded with ld.global from
//      lut_sp_8_1024 at byte offset (tid.x * 8) & 1016. The products
//      w, w^2, ..., w^7 are built by repeated complex multiplication and
//      applied to seven of the eight butterfly outputs; the first output is
//      stored unmodified.
//   3. The eight real parts are written to shared memory (32 contiguous bytes
//      per thread), a barrier is taken, and eight floats are read back at a
//      512-byte stride. The imaginary parts then go through the same
//      addresses in a second store / barrier / load round.
//   4. Steps 1-3 are repeated twice more:
//        - with lut_sp_8_128, byte offset tid.x & 120, store stride 32 bytes;
//        - with lut_sp_8_16, entry chosen by bit 6 of tid.x, store stride
//          256 bytes.
//      Read-back is at a 512-byte stride in both cases.
//   5. Four final add/sub pairs combine values read back 2048 bytes apart:
//      the sums go to rmem[0..3], the differences to rmem[4..7].
//
// Shared-memory addressing:
//   base = smem + (tid.y << 12) + ((tid.x << 5) & -4096)
//   Threads that agree on tid.y and on tid.x >> 7 therefore exchange data
//   through the same 4096-byte region.
//
// Synchronization: the body executes twelve `barrier.sync 0` instructions
// unconditionally.
// NOTE(review): this presumably requires every thread of the block to call
// the function in convergent control flow - confirm at the call sites.
// NOTE(review): the asm statement declares no clobber list (no "memory")
// although it writes shared memory; it relies on `volatile` only. Confirm this
// is intended by the generator.
template<> __forceinline__ __device__ void cufftdx_private_function<84, float, 1>(cufftdx::detail::complex<float> *rmem, unsigned smem){

asm volatile (R"({
.reg .f32 f<475>;
.reg .b32 r<28>;
.reg .b64 rd<12>;
mov.u32 r1, %tid.y;
shl.b32 r2, r1, 12;
mov.u32 r3, %16;
add.s32 r4, r3, r2;
mov.u32 r5, %tid.x;
add.f32 f33, %20, %30;
add.f32 f34, %21, %32;
sub.f32 f35, %20, %30;
sub.f32 f36, %21, %32;
add.f32 f37, %25, %36;
add.f32 f38, %27, %37;
sub.f32 f39, %25, %36;
sub.f32 f40, %27, %37;
add.f32 f41, f33, f37;
add.f32 f42, f34, f38;
sub.f32 f43, f33, f37;
sub.f32 f44, f34, f38;
add.f32 f45, f35, f40;
sub.f32 f46, f36, f39;
sub.f32 f47, f35, f40;
add.f32 f48, f36, f39;
add.f32 f49, %22, %33;
add.f32 f50, %24, %35;
sub.f32 f51, %22, %33;
sub.f32 f52, %24, %35;
add.f32 f53, %28, %38;
add.f32 f54, %29, %39;
sub.f32 f55, %28, %38;
sub.f32 f56, %29, %39;
add.f32 f57, f49, f53;
add.f32 f58, f50, f54;
sub.f32 f59, f49, f53;
sub.f32 f60, f50, f54;
add.f32 f61, f51, f56;
sub.f32 f62, f52, f55;
sub.f32 f63, f51, f56;
add.f32 f64, f52, f55;
mul.f32 f65, f61, 0f3F3504F3;
mul.f32 f66, f62, 0fBF3504F3;
sub.f32 f67, f65, f66;
mul.f32 f68, f62, 0f3F3504F3;
fma.rn.f32 f69, f61, 0fBF3504F3, f68;
mul.f32 f70, f63, 0fBF3504F3;
mul.f32 f71, f64, 0fBF3504F3;
sub.f32 f72, f70, f71;
add.f32 f73, f70, f71;
add.f32 f74, f41, f57;
add.f32 f75, f42, f58;
sub.f32 f76, f41, f57;
sub.f32 f77, f42, f58;
add.f32 f78, f45, f67;
add.f32 f79, f46, f69;
sub.f32 f80, f45, f67;
sub.f32 f81, f46, f69;
add.f32 f82, f43, f60;
sub.f32 f83, f44, f59;
sub.f32 f84, f43, f60;
add.f32 f85, f44, f59;
add.f32 f86, f47, f72;
add.f32 f87, f48, f73;
sub.f32 f88, f47, f72;
sub.f32 f89, f48, f73;
and.b32 r6, r5, 127;
shl.b32 r7, r5, 3;
cvt.u64.u32 rd2, r7;
and.b64 rd3, rd2, 1016;
mov.u64 rd4, %17;
add.s64 rd5, rd4, rd3;
ld.global.v2.f32 {f90, f91}, [rd5];
mul.f32 f94, f90, f78;
mul.f32 f95, f91, f79;
sub.f32 f96, f94, f95;
mul.f32 f97, f90, f79;
fma.rn.f32 f98, f91, f78, f97;
mul.f32 f99, f90, f90;
mul.f32 f100, f91, f91;
sub.f32 f101, f99, f100;
mul.f32 f102, f91, f90;
fma.rn.f32 f103, f91, f90, f102;
mul.f32 f104, f101, f82;
mul.f32 f105, f103, f83;
sub.f32 f106, f104, f105;
mul.f32 f107, f101, f83;
fma.rn.f32 f108, f103, f82, f107;
mul.f32 f109, f90, f101;
mul.f32 f110, f91, f103;
sub.f32 f111, f109, f110;
mul.f32 f112, f90, f103;
fma.rn.f32 f113, f91, f101, f112;
mul.f32 f114, f111, f86;
mul.f32 f115, f113, f87;
sub.f32 f116, f114, f115;
mul.f32 f117, f111, f87;
fma.rn.f32 f118, f113, f86, f117;
mul.f32 f119, f90, f111;
mul.f32 f120, f91, f113;
sub.f32 f121, f119, f120;
mul.f32 f122, f90, f113;
fma.rn.f32 f123, f91, f111, f122;
mul.f32 f124, f121, f76;
mul.f32 f125, f123, f77;
sub.f32 f126, f124, f125;
mul.f32 f127, f121, f77;
fma.rn.f32 f128, f123, f76, f127;
mul.f32 f129, f90, f121;
mul.f32 f130, f91, f123;
sub.f32 f131, f129, f130;
mul.f32 f132, f90, f123;
fma.rn.f32 f133, f91, f121, f132;
mul.f32 f134, f131, f80;
mul.f32 f135, f133, f81;
sub.f32 f136, f134, f135;
mul.f32 f137, f131, f81;
fma.rn.f32 f138, f133, f80, f137;
mul.f32 f139, f90, f131;
mul.f32 f140, f91, f133;
sub.f32 f141, f139, f140;
mul.f32 f142, f90, f133;
fma.rn.f32 f143, f91, f131, f142;
mul.f32 f144, f141, f84;
mul.f32 f145, f143, f85;
sub.f32 f146, f144, f145;
mul.f32 f147, f141, f85;
fma.rn.f32 f148, f143, f84, f147;
mul.f32 f149, f90, f141;
mul.f32 f150, f91, f143;
sub.f32 f151, f149, f150;
mul.f32 f152, f90, f143;
fma.rn.f32 f153, f91, f141, f152;
mul.f32 f154, f151, f88;
mul.f32 f155, f153, f89;
sub.f32 f156, f154, f155;
mul.f32 f157, f151, f89;
fma.rn.f32 f158, f153, f88, f157;
shl.b32 r8, r5, 5;
and.b32 r9, r8, -4096;
add.s32 r10, r4, r9;
barrier.sync 0;
and.b32 r11, r8, 4064;
add.s32 r12, r10, r11;
st.shared.v4.f32 [r12], {f74, f96, f106, f116};
st.shared.v4.f32 [r12+16], {f126, f136, f146, f156};
barrier.sync 0;
mad.lo.s32 r13, r6, -28, r12;
ld.shared.f32 f159, [r13];
ld.shared.f32 f160, [r13+512];
ld.shared.f32 f161, [r13+1024];
ld.shared.f32 f162, [r13+1536];
ld.shared.f32 f163, [r13+2048];
ld.shared.f32 f164, [r13+2560];
ld.shared.f32 f165, [r13+3072];
ld.shared.f32 f166, [r13+3584];
barrier.sync 0;
st.shared.v4.f32 [r12], {f75, f98, f108, f118};
st.shared.v4.f32 [r12+16], {f128, f138, f148, f158};
barrier.sync 0;
ld.shared.f32 f167, [r13];
ld.shared.f32 f168, [r13+512];
ld.shared.f32 f169, [r13+1024];
ld.shared.f32 f170, [r13+1536];
ld.shared.f32 f171, [r13+2048];
ld.shared.f32 f172, [r13+2560];
ld.shared.f32 f173, [r13+3072];
ld.shared.f32 f174, [r13+3584];
add.f32 f175, f159, f163;
add.f32 f176, f167, f171;
sub.f32 f177, f159, f163;
sub.f32 f178, f167, f171;
add.f32 f179, f161, f165;
add.f32 f180, f169, f173;
sub.f32 f181, f161, f165;
sub.f32 f182, f169, f173;
add.f32 f183, f175, f179;
add.f32 f184, f176, f180;
sub.f32 f185, f175, f179;
sub.f32 f186, f176, f180;
add.f32 f187, f177, f182;
sub.f32 f188, f178, f181;
sub.f32 f189, f177, f182;
add.f32 f190, f178, f181;
add.f32 f191, f160, f164;
add.f32 f192, f168, f172;
sub.f32 f193, f160, f164;
sub.f32 f194, f168, f172;
add.f32 f195, f162, f166;
add.f32 f196, f170, f174;
sub.f32 f197, f162, f166;
sub.f32 f198, f170, f174;
add.f32 f199, f191, f195;
add.f32 f200, f192, f196;
sub.f32 f201, f191, f195;
sub.f32 f202, f192, f196;
add.f32 f203, f193, f198;
sub.f32 f204, f194, f197;
sub.f32 f205, f193, f198;
add.f32 f206, f194, f197;
mul.f32 f207, f203, 0f3F3504F3;
mul.f32 f208, f204, 0fBF3504F3;
sub.f32 f209, f207, f208;
mul.f32 f210, f204, 0f3F3504F3;
fma.rn.f32 f211, f203, 0fBF3504F3, f210;
mul.f32 f212, f205, 0fBF3504F3;
mul.f32 f213, f206, 0fBF3504F3;
sub.f32 f214, f212, f213;
add.f32 f215, f212, f213;
add.f32 f216, f183, f199;
add.f32 f217, f184, f200;
sub.f32 f218, f183, f199;
sub.f32 f219, f184, f200;
add.f32 f220, f187, f209;
add.f32 f221, f188, f211;
sub.f32 f222, f187, f209;
sub.f32 f223, f188, f211;
add.f32 f224, f185, f202;
sub.f32 f225, f186, f201;
sub.f32 f226, f185, f202;
add.f32 f227, f186, f201;
add.f32 f228, f189, f214;
add.f32 f229, f190, f215;
sub.f32 f230, f189, f214;
sub.f32 f231, f190, f215;
and.b32 r14, r5, 120;
cvt.u64.u32 rd6, r14;
mov.u64 rd7, %18;
add.s64 rd8, rd7, rd6;
ld.global.v2.f32 {f232, f233}, [rd8];
mul.f32 f236, f232, f220;
mul.f32 f237, f233, f221;
sub.f32 f238, f236, f237;
mul.f32 f239, f232, f221;
fma.rn.f32 f240, f233, f220, f239;
mul.f32 f241, f232, f232;
mul.f32 f242, f233, f233;
sub.f32 f243, f241, f242;
mul.f32 f244, f233, f232;
fma.rn.f32 f245, f233, f232, f244;
mul.f32 f246, f243, f224;
mul.f32 f247, f245, f225;
sub.f32 f248, f246, f247;
mul.f32 f249, f243, f225;
fma.rn.f32 f250, f245, f224, f249;
mul.f32 f251, f232, f243;
mul.f32 f252, f233, f245;
sub.f32 f253, f251, f252;
mul.f32 f254, f232, f245;
fma.rn.f32 f255, f233, f243, f254;
mul.f32 f256, f253, f228;
mul.f32 f257, f255, f229;
sub.f32 f258, f256, f257;
mul.f32 f259, f253, f229;
fma.rn.f32 f260, f255, f228, f259;
mul.f32 f261, f232, f253;
mul.f32 f262, f233, f255;
sub.f32 f263, f261, f262;
mul.f32 f264, f232, f255;
fma.rn.f32 f265, f233, f253, f264;
mul.f32 f266, f263, f218;
mul.f32 f267, f265, f219;
sub.f32 f268, f266, f267;
mul.f32 f269, f263, f219;
fma.rn.f32 f270, f265, f218, f269;
mul.f32 f271, f232, f263;
mul.f32 f272, f233, f265;
sub.f32 f273, f271, f272;
mul.f32 f274, f232, f265;
fma.rn.f32 f275, f233, f263, f274;
mul.f32 f276, f273, f222;
mul.f32 f277, f275, f223;
sub.f32 f278, f276, f277;
mul.f32 f279, f273, f223;
fma.rn.f32 f280, f275, f222, f279;
mul.f32 f281, f232, f273;
mul.f32 f282, f233, f275;
sub.f32 f283, f281, f282;
mul.f32 f284, f232, f275;
fma.rn.f32 f285, f233, f273, f284;
mul.f32 f286, f283, f226;
mul.f32 f287, f285, f227;
sub.f32 f288, f286, f287;
mul.f32 f289, f283, f227;
fma.rn.f32 f290, f285, f226, f289;
mul.f32 f291, f232, f283;
mul.f32 f292, f233, f285;
sub.f32 f293, f291, f292;
mul.f32 f294, f232, f285;
fma.rn.f32 f295, f233, f283, f294;
mul.f32 f296, f293, f230;
mul.f32 f297, f295, f231;
sub.f32 f298, f296, f297;
mul.f32 f299, f293, f231;
fma.rn.f32 f300, f295, f230, f299;
shl.b32 r15, r5, 2;
and.b32 r16, r15, 28;
add.s32 r17, r10, r16;
barrier.sync 0;
and.b32 r18, r8, 3840;
add.s32 r19, r17, r18;
st.shared.f32 [r19], f216;
st.shared.f32 [r19+32], f238;
st.shared.f32 [r19+64], f248;
st.shared.f32 [r19+96], f258;
st.shared.f32 [r19+128], f268;
st.shared.f32 [r19+160], f278;
st.shared.f32 [r19+192], f288;
st.shared.f32 [r19+224], f298;
barrier.sync 0;
mad.lo.s32 r20, r14, -28, r19;
ld.shared.f32 f301, [r20];
ld.shared.f32 f302, [r20+512];
ld.shared.f32 f303, [r20+1024];
ld.shared.f32 f304, [r20+1536];
ld.shared.f32 f305, [r20+2048];
ld.shared.f32 f306, [r20+2560];
ld.shared.f32 f307, [r20+3072];
ld.shared.f32 f308, [r20+3584];
barrier.sync 0;
st.shared.f32 [r19], f217;
st.shared.f32 [r19+32], f240;
st.shared.f32 [r19+64], f250;
st.shared.f32 [r19+96], f260;
st.shared.f32 [r19+128], f270;
st.shared.f32 [r19+160], f280;
st.shared.f32 [r19+192], f290;
st.shared.f32 [r19+224], f300;
barrier.sync 0;
ld.shared.f32 f309, [r20];
ld.shared.f32 f310, [r20+512];
ld.shared.f32 f311, [r20+1024];
ld.shared.f32 f312, [r20+1536];
ld.shared.f32 f313, [r20+2048];
ld.shared.f32 f314, [r20+2560];
ld.shared.f32 f315, [r20+3072];
ld.shared.f32 f316, [r20+3584];
add.f32 f317, f301, f305;
add.f32 f318, f309, f313;
sub.f32 f319, f301, f305;
sub.f32 f320, f309, f313;
add.f32 f321, f303, f307;
add.f32 f322, f311, f315;
sub.f32 f323, f303, f307;
sub.f32 f324, f311, f315;
add.f32 f325, f317, f321;
add.f32 f326, f318, f322;
sub.f32 f327, f317, f321;
sub.f32 f328, f318, f322;
add.f32 f329, f319, f324;
sub.f32 f330, f320, f323;
sub.f32 f331, f319, f324;
add.f32 f332, f320, f323;
add.f32 f333, f302, f306;
add.f32 f334, f310, f314;
sub.f32 f335, f302, f306;
sub.f32 f336, f310, f314;
add.f32 f337, f304, f308;
add.f32 f338, f312, f316;
sub.f32 f339, f304, f308;
sub.f32 f340, f312, f316;
add.f32 f341, f333, f337;
add.f32 f342, f334, f338;
sub.f32 f343, f333, f337;
sub.f32 f344, f334, f338;
add.f32 f345, f335, f340;
sub.f32 f346, f336, f339;
sub.f32 f347, f335, f340;
add.f32 f348, f336, f339;
mul.f32 f349, f345, 0f3F3504F3;
mul.f32 f350, f346, 0fBF3504F3;
sub.f32 f351, f349, f350;
mul.f32 f352, f346, 0f3F3504F3;
fma.rn.f32 f353, f345, 0fBF3504F3, f352;
mul.f32 f354, f347, 0fBF3504F3;
mul.f32 f355, f348, 0fBF3504F3;
sub.f32 f356, f354, f355;
add.f32 f357, f354, f355;
add.f32 f358, f325, f341;
add.f32 f359, f326, f342;
sub.f32 f360, f325, f341;
sub.f32 f361, f326, f342;
add.f32 f362, f329, f351;
add.f32 f363, f330, f353;
sub.f32 f364, f329, f351;
sub.f32 f365, f330, f353;
add.f32 f366, f327, f344;
sub.f32 f367, f328, f343;
sub.f32 f368, f327, f344;
add.f32 f369, f328, f343;
add.f32 f370, f331, f356;
add.f32 f371, f332, f357;
sub.f32 f372, f331, f356;
sub.f32 f373, f332, f357;
and.b32 r21, r5, 64;
bfe.u32 r22, r5, 6, 1;
mul.wide.u32 rd9, r22, 8;
mov.u64 rd10, %19;
add.s64 rd11, rd10, rd9;
ld.global.v2.f32 {f374, f375}, [rd11];
mul.f32 f378, f374, f362;
mul.f32 f379, f375, f363;
sub.f32 f380, f378, f379;
mul.f32 f381, f374, f363;
fma.rn.f32 f382, f375, f362, f381;
mul.f32 f383, f374, f374;
mul.f32 f384, f375, f375;
sub.f32 f385, f383, f384;
mul.f32 f386, f375, f374;
fma.rn.f32 f387, f375, f374, f386;
mul.f32 f388, f385, f366;
mul.f32 f389, f387, f367;
sub.f32 f390, f388, f389;
mul.f32 f391, f385, f367;
fma.rn.f32 f392, f387, f366, f391;
mul.f32 f393, f374, f385;
mul.f32 f394, f375, f387;
sub.f32 f395, f393, f394;
mul.f32 f396, f374, f387;
fma.rn.f32 f397, f375, f385, f396;
mul.f32 f398, f395, f370;
mul.f32 f399, f397, f371;
sub.f32 f400, f398, f399;
mul.f32 f401, f395, f371;
fma.rn.f32 f402, f397, f370, f401;
mul.f32 f403, f374, f395;
mul.f32 f404, f375, f397;
sub.f32 f405, f403, f404;
mul.f32 f406, f374, f397;
fma.rn.f32 f407, f375, f395, f406;
mul.f32 f408, f405, f360;
mul.f32 f409, f407, f361;
sub.f32 f410, f408, f409;
mul.f32 f411, f405, f361;
fma.rn.f32 f412, f407, f360, f411;
mul.f32 f413, f374, f405;
mul.f32 f414, f375, f407;
sub.f32 f415, f413, f414;
mul.f32 f416, f374, f407;
fma.rn.f32 f417, f375, f405, f416;
mul.f32 f418, f415, f364;
mul.f32 f419, f417, f365;
sub.f32 f420, f418, f419;
mul.f32 f421, f415, f365;
fma.rn.f32 f422, f417, f364, f421;
mul.f32 f423, f374, f415;
mul.f32 f424, f375, f417;
sub.f32 f425, f423, f424;
mul.f32 f426, f374, f417;
fma.rn.f32 f427, f375, f415, f426;
mul.f32 f428, f425, f368;
mul.f32 f429, f427, f369;
sub.f32 f430, f428, f429;
mul.f32 f431, f425, f369;
fma.rn.f32 f432, f427, f368, f431;
mul.f32 f433, f374, f425;
mul.f32 f434, f375, f427;
sub.f32 f435, f433, f434;
mul.f32 f436, f374, f427;
fma.rn.f32 f437, f375, f425, f436;
mul.f32 f438, f435, f372;
mul.f32 f439, f437, f373;
sub.f32 f440, f438, f439;
mul.f32 f441, f435, f373;
fma.rn.f32 f442, f437, f372, f441;
and.b32 r23, r15, 252;
add.s32 r24, r10, r23;
barrier.sync 0;
and.b32 r25, r8, 2048;
add.s32 r26, r24, r25;
st.shared.f32 [r26], f358;
st.shared.f32 [r26+256], f380;
st.shared.f32 [r26+512], f390;
st.shared.f32 [r26+768], f400;
st.shared.f32 [r26+1024], f410;
st.shared.f32 [r26+1280], f420;
st.shared.f32 [r26+1536], f430;
st.shared.f32 [r26+1792], f440;
barrier.sync 0;
mad.lo.s32 r27, r21, -28, r26;
ld.shared.f32 f443, [r27];
ld.shared.f32 f444, [r27+512];
ld.shared.f32 f445, [r27+1024];
ld.shared.f32 f446, [r27+1536];
ld.shared.f32 f447, [r27+2048];
ld.shared.f32 f448, [r27+2560];
ld.shared.f32 f449, [r27+3072];
ld.shared.f32 f450, [r27+3584];
barrier.sync 0;
st.shared.f32 [r26], f359;
st.shared.f32 [r26+256], f382;
st.shared.f32 [r26+512], f392;
st.shared.f32 [r26+768], f402;
st.shared.f32 [r26+1024], f412;
st.shared.f32 [r26+1280], f422;
st.shared.f32 [r26+1536], f432;
st.shared.f32 [r26+1792], f442;
barrier.sync 0;
ld.shared.f32 f451, [r27];
ld.shared.f32 f452, [r27+512];
ld.shared.f32 f453, [r27+1024];
ld.shared.f32 f454, [r27+1536];
ld.shared.f32 f455, [r27+2048];
ld.shared.f32 f456, [r27+2560];
ld.shared.f32 f457, [r27+3072];
ld.shared.f32 f458, [r27+3584];
add.f32 %0, f443, f447;
add.f32 %1, f451, f455;
add.f32 %2, f444, f448;
add.f32 %3, f452, f456;
add.f32 %4, f445, f449;
add.f32 %5, f453, f457;
add.f32 %6, f446, f450;
add.f32 %7, f454, f458;
sub.f32 %8, f443, f447;
sub.f32 %9, f451, f455;
sub.f32 %10, f444, f448;
sub.f32 %11, f452, f456;
sub.f32 %12, f445, f449;
sub.f32 %13, f453, f457;
sub.f32 %14, f446, f450;
sub.f32 %15, f454, f458;
})"
     // Operand map: %0-%15 = outputs rmem[0..7].{x,y}; %16 = smem;
     // %17-%19 = LUT addresses (lut_sp_8_1024, lut_sp_8_128, lut_sp_8_16);
     // %20-%39 = inputs. Several .y inputs are listed twice, and the PTX above
     // never references %23, %26, %31 or %34. The template addresses operands
     // by position, so the list order must not change.
     : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y): "r"(smem), "l"(lut_sp_8_1024), "l"(lut_sp_8_128), "l"(lut_sp_8_16), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y));
};




template<> __forceinline__ __device__ void cufftdx_private_function<85, float, 1>(cufftdx::detail::complex<float> *rmem, unsigned smem){

asm volatile (R"({
.reg .f32 f<1821>;
.reg .b32 r<18>;
.reg .b64 rd<9>;
mov.u32 r1, %tid.y;
shl.b32 r2, r1, 12;
mov.u32 r3, %64;
add.s32 r4, r3, r2;
add.f32 f129, %66, %98;
sub.f32 f131, %66, %98;
add.f32 f1819, %67, %130;
sub.f32 f132, %67, %130;
add.f32 f133, %82, %114;
sub.f32 f135, %82, %114;
add.f32 f1817, %131, %115;
sub.f32 f136, %131, %115;
add.f32 f137, f129, f133;
sub.f32 f139, f129, f133;
add.f32 f1816, f1819, f1817;
sub.f32 f140, f1819, f1817;
add.f32 f141, f131, f136;
sub.f32 f143, f131, f136;
sub.f32 f1815, f132, f135;
add.f32 f144, f132, f135;
add.f32 f145, %74, %106;
sub.f32 f147, %74, %106;
add.f32 f1812, %132, %133;
sub.f32 f148, %132, %133;
add.f32 f149, %90, %122;
sub.f32 f151, %90, %122;
add.f32 f1810, %91, %134;
sub.f32 f152, %91, %134;
add.f32 f153, f145, f149;
sub.f32 f155, f145, f149;
add.f32 f1809, f1812, f1810;
sub.f32 f156, f1812, f1810;
add.f32 f157, f147, f152;
sub.f32 f159, f147, f152;
sub.f32 f1808, f148, f151;
add.f32 f160, f148, f151;
mul.f32 f162, f1808, 0fBF3504F3;
mul.f32 f1807, f157, 0f3F3504F3;
sub.f32 f163, f1807, f162;
mul.f32 f164, f1808, 0f3F3504F3;
fma.rn.f32 f165, f157, 0fBF3504F3, f164;
mul.f32 f166, f159, 0fBF3504F3;
mul.f32 f167, f160, 0fBF3504F3;
sub.f32 f168, f166, f167;
add.f32 f169, f166, f167;
add.f32 f170, f137, f153;
sub.f32 f172, f137, f153;
add.f32 f1806, f1816, f1809;
sub.f32 f173, f1816, f1809;
add.f32 f174, f141, f163;
sub.f32 f176, f141, f163;
add.f32 f1805, f1815, f165;
sub.f32 f177, f1815, f165;
add.f32 f178, f139, f156;
sub.f32 f180, f139, f156;
sub.f32 f1804, f140, f155;
add.f32 f181, f140, f155;
add.f32 f182, f143, f168;
sub.f32 f184, f143, f168;
add.f32 f1803, f144, f169;
sub.f32 f185, f144, f169;
add.f32 f186, %70, %102;
sub.f32 f188, %70, %102;
add.f32 f1801, %135, %103;
sub.f32 f189, %135, %103;
add.f32 f190, %86, %118;
sub.f32 f192, %86, %118;
add.f32 f1798, %137, %136;
sub.f32 f193, %137, %136;
add.f32 f194, f186, f190;
sub.f32 f196, f186, f190;
add.f32 f1797, f1801, f1798;
sub.f32 f197, f1801, f1798;
add.f32 f198, f188, f193;
sub.f32 f200, f188, f193;
sub.f32 f1796, f189, f192;
add.f32 f201, f189, f192;
add.f32 f202, %78, %110;
sub.f32 f204, %78, %110;
add.f32 f1794, %79, %138;
sub.f32 f205, %79, %138;
add.f32 f206, %94, %126;
sub.f32 f208, %94, %126;
add.f32 f1792, %139, %127;
sub.f32 f209, %139, %127;
add.f32 f210, f202, f206;
sub.f32 f212, f202, f206;
add.f32 f1791, f1794, f1792;
sub.f32 f213, f1794, f1792;
add.f32 f214, f204, f209;
sub.f32 f216, f204, f209;
sub.f32 f1790, f205, f208;
add.f32 f217, f205, f208;
mul.f32 f219, f1790, 0fBF3504F3;
mul.f32 f1789, f214, 0f3F3504F3;
sub.f32 f220, f1789, f219;
mul.f32 f221, f1790, 0f3F3504F3;
fma.rn.f32 f222, f214, 0fBF3504F3, f221;
mul.f32 f223, f216, 0fBF3504F3;
mul.f32 f224, f217, 0fBF3504F3;
sub.f32 f225, f223, f224;
add.f32 f226, f223, f224;
add.f32 f227, f194, f210;
sub.f32 f229, f194, f210;
add.f32 f1788, f1797, f1791;
sub.f32 f230, f1797, f1791;
add.f32 f231, f198, f220;
sub.f32 f233, f198, f220;
add.f32 f1787, f1796, f222;
sub.f32 f234, f1796, f222;
add.f32 f235, f196, f213;
sub.f32 f237, f196, f213;
sub.f32 f1786, f197, f212;
add.f32 f238, f197, f212;
add.f32 f239, f200, f225;
sub.f32 f241, f200, f225;
add.f32 f1785, f201, f226;
sub.f32 f242, f201, f226;
mul.f32 f1783, f231, 0f3F6C835E;
mul.f32 f1784, f1787, 0fBEC3EF15;
sub.f32 f245, f1783, f1784;
mul.f32 f246, f1787, 0f3F6C835E;
fma.rn.f32 f247, f231, 0fBEC3EF15, f246;
mul.f32 f1781, f235, 0f3F3504F3;
mul.f32 f1782, f1786, 0fBF3504F3;
sub.f32 f250, f1781, f1782;
mul.f32 f251, f1786, 0f3F3504F3;
fma.rn.f32 f252, f235, 0fBF3504F3, f251;
mul.f32 f1779, f239, 0f3EC3EF15;
mul.f32 f1780, f1785, 0fBF6C835E;
sub.f32 f255, f1779, f1780;
mul.f32 f256, f1785, 0f3EC3EF15;
fma.rn.f32 f257, f239, 0fBF6C835E, f256;
mul.f32 f1777, f233, 0fBEC3EF15;
mul.f32 f1778, f234, 0fBF6C835E;
sub.f32 f260, f1777, f1778;
mul.f32 f261, f234, 0fBEC3EF15;
fma.rn.f32 f262, f233, 0fBF6C835E, f261;
mul.f32 f263, f237, 0fBF3504F3;
mul.f32 f264, f238, 0fBF3504F3;
sub.f32 f265, f263, f264;
add.f32 f266, f263, f264;
mul.f32 f1775, f241, 0fBF6C835E;
mul.f32 f1776, f242, 0fBEC3EF15;
sub.f32 f269, f1775, f1776;
mul.f32 f270, f242, 0fBF6C835E;
fma.rn.f32 f271, f241, 0fBEC3EF15, f270;
add.f32 f272, f170, f227;
sub.f32 f274, f170, f227;
add.f32 f1774, f1806, f1788;
sub.f32 f275, f1806, f1788;
add.f32 f276, f174, f245;
sub.f32 f278, f174, f245;
add.f32 f1773, f1805, f247;
sub.f32 f279, f1805, f247;
add.f32 f280, f178, f250;
sub.f32 f282, f178, f250;
add.f32 f1772, f1804, f252;
sub.f32 f283, f1804, f252;
add.f32 f284, f182, f255;
sub.f32 f286, f182, f255;
add.f32 f1771, f1803, f257;
sub.f32 f287, f1803, f257;
add.f32 f288, f172, f230;
sub.f32 f290, f172, f230;
sub.f32 f1770, f173, f229;
add.f32 f291, f173, f229;
add.f32 f292, f176, f260;
sub.f32 f294, f176, f260;
add.f32 f1769, f177, f262;
sub.f32 f295, f177, f262;
add.f32 f296, f180, f265;
sub.f32 f298, f180, f265;
add.f32 f1768, f181, f266;
sub.f32 f299, f181, f266;
add.f32 f300, f184, f269;
sub.f32 f302, f184, f269;
add.f32 f1767, f185, f271;
sub.f32 f303, f185, f271;
add.f32 f304, %68, %100;
sub.f32 f306, %68, %100;
add.f32 f1764, %141, %140;
sub.f32 f307, %141, %140;
add.f32 f308, %84, %116;
sub.f32 f310, %84, %116;
add.f32 f1762, %85, %142;
sub.f32 f311, %85, %142;
add.f32 f312, f304, f308;
sub.f32 f314, f304, f308;
add.f32 f1761, f1764, f1762;
sub.f32 f315, f1764, f1762;
add.f32 f316, f306, f311;
sub.f32 f318, f306, f311;
sub.f32 f1760, f307, f310;
add.f32 f319, f307, f310;
add.f32 f320, %76, %108;
sub.f32 f322, %76, %108;
add.f32 f1758, %143, %109;
sub.f32 f323, %143, %109;
add.f32 f324, %92, %124;
sub.f32 f326, %92, %124;
add.f32 f1755, %145, %144;
sub.f32 f327, %145, %144;
add.f32 f328, f320, f324;
sub.f32 f330, f320, f324;
add.f32 f1754, f1758, f1755;
sub.f32 f331, f1758, f1755;
add.f32 f332, f322, f327;
sub.f32 f334, f322, f327;
sub.f32 f1753, f323, f326;
add.f32 f335, f323, f326;
mul.f32 f1751, f332, 0f3F3504F3;
mul.f32 f1752, f1753, 0fBF3504F3;
sub.f32 f338, f1751, f1752;
mul.f32 f339, f1753, 0f3F3504F3;
fma.rn.f32 f340, f332, 0fBF3504F3, f339;
mul.f32 f341, f334, 0fBF3504F3;
mul.f32 f342, f335, 0fBF3504F3;
sub.f32 f343, f341, f342;
add.f32 f344, f341, f342;
add.f32 f345, f312, f328;
sub.f32 f347, f312, f328;
add.f32 f1750, f1761, f1754;
sub.f32 f348, f1761, f1754;
add.f32 f349, f316, f338;
sub.f32 f351, f316, f338;
add.f32 f1749, f1760, f340;
sub.f32 f352, f1760, f340;
add.f32 f353, f314, f331;
sub.f32 f355, f314, f331;
sub.f32 f1748, f315, f330;
add.f32 f356, f315, f330;
add.f32 f357, f318, f343;
sub.f32 f359, f318, f343;
add.f32 f1747, f319, f344;
sub.f32 f360, f319, f344;
add.f32 f361, %72, %104;
sub.f32 f363, %72, %104;
add.f32 f1745, %73, %146;
sub.f32 f364, %73, %146;
add.f32 f365, %88, %120;
sub.f32 f367, %88, %120;
add.f32 f1743, %147, %121;
sub.f32 f368, %147, %121;
add.f32 f369, f361, f365;
sub.f32 f371, f361, f365;
add.f32 f1742, f1745, f1743;
sub.f32 f372, f1745, f1743;
add.f32 f373, f363, f368;
sub.f32 f375, f363, f368;
sub.f32 f1741, f364, f367;
add.f32 f376, f364, f367;
add.f32 f377, %80, %112;
sub.f32 f379, %80, %112;
add.f32 f1738, %148, %149;
sub.f32 f380, %148, %149;
add.f32 f381, %96, %128;
sub.f32 f383, %96, %128;
add.f32 f1737, %97, %129;
sub.f32 f384, %97, %129;
add.f32 f385, f377, f381;
sub.f32 f387, f377, f381;
add.f32 f1736, f1738, f1737;
sub.f32 f388, f1738, f1737;
add.f32 f389, f379, f384;
sub.f32 f391, f379, f384;
sub.f32 f1735, f380, f383;
add.f32 f392, f380, f383;
mul.f32 f1733, f389, 0f3F3504F3;
mul.f32 f1734, f1735, 0fBF3504F3;
sub.f32 f395, f1733, f1734;
mul.f32 f396, f1735, 0f3F3504F3;
fma.rn.f32 f397, f389, 0fBF3504F3, f396;
mul.f32 f398, f391, 0fBF3504F3;
mul.f32 f399, f392, 0fBF3504F3;
sub.f32 f400, f398, f399;
add.f32 f401, f398, f399;
add.f32 f402, f369, f385;
sub.f32 f404, f369, f385;
add.f32 f1732, f1742, f1736;
sub.f32 f405, f1742, f1736;
add.f32 f406, f373, f395;
sub.f32 f408, f373, f395;
add.f32 f1731, f1741, f397;
sub.f32 f409, f1741, f397;
add.f32 f410, f371, f388;
sub.f32 f412, f371, f388;
sub.f32 f1730, f372, f387;
add.f32 f413, f372, f387;
add.f32 f414, f375, f400;
sub.f32 f416, f375, f400;
add.f32 f1729, f376, f401;
sub.f32 f417, f376, f401;
mul.f32 f419, f1731, 0fBEC3EF15;
mul.f32 f1728, f406, 0f3F6C835E;
sub.f32 f420, f1728, f419;
mul.f32 f421, f1731, 0f3F6C835E;
fma.rn.f32 f422, f406, 0fBEC3EF15, f421;
mul.f32 f424, f1730, 0fBF3504F3;
mul.f32 f1727, f410, 0f3F3504F3;
sub.f32 f425, f1727, f424;
mul.f32 f426, f1730, 0f3F3504F3;
fma.rn.f32 f427, f410, 0fBF3504F3, f426;
mul.f32 f1725, f414, 0f3EC3EF15;
mul.f32 f1726, f1729, 0fBF6C835E;
sub.f32 f430, f1725, f1726;
mul.f32 f431, f1729, 0f3EC3EF15;
fma.rn.f32 f432, f414, 0fBF6C835E, f431;
mul.f32 f1723, f408, 0fBEC3EF15;
mul.f32 f1724, f409, 0fBF6C835E;
sub.f32 f435, f1723, f1724;
mul.f32 f436, f409, 0fBEC3EF15;
fma.rn.f32 f437, f408, 0fBF6C835E, f436;
mul.f32 f438, f412, 0fBF3504F3;
mul.f32 f439, f413, 0fBF3504F3;
sub.f32 f440, f438, f439;
add.f32 f441, f438, f439;
mul.f32 f443, f417, 0fBEC3EF15;
mul.f32 f1722, f416, 0fBF6C835E;
sub.f32 f444, f1722, f443;
mul.f32 f445, f417, 0fBF6C835E;
fma.rn.f32 f446, f416, 0fBEC3EF15, f445;
add.f32 f447, f345, f402;
sub.f32 f449, f345, f402;
add.f32 f1721, f1750, f1732;
sub.f32 f450, f1750, f1732;
add.f32 f451, f349, f420;
sub.f32 f453, f349, f420;
add.f32 f1720, f1749, f422;
sub.f32 f454, f1749, f422;
add.f32 f455, f353, f425;
sub.f32 f457, f353, f425;
add.f32 f1719, f1748, f427;
sub.f32 f458, f1748, f427;
add.f32 f459, f357, f430;
sub.f32 f461, f357, f430;
add.f32 f1718, f1747, f432;
sub.f32 f462, f1747, f432;
add.f32 f463, f347, f405;
sub.f32 f465, f347, f405;
sub.f32 f1717, f348, f404;
add.f32 f466, f348, f404;
add.f32 f467, f351, f435;
sub.f32 f469, f351, f435;
add.f32 f1716, f352, f437;
sub.f32 f470, f352, f437;
add.f32 f471, f355, f440;
sub.f32 f473, f355, f440;
add.f32 f1715, f356, f441;
sub.f32 f474, f356, f441;
add.f32 f475, f359, f444;
sub.f32 f477, f359, f444;
add.f32 f1714, f360, f446;
sub.f32 f478, f360, f446;
mul.f32 f480, f1720, 0fBE47C5C2;
mul.f32 f1713, f451, 0f3F7B14BE;
sub.f32 f481, f1713, f480;
mul.f32 f482, f1720, 0f3F7B14BE;
fma.rn.f32 f483, f451, 0fBE47C5C2, f482;
mul.f32 f485, f1719, 0fBEC3EF15;
mul.f32 f1712, f455, 0f3F6C835E;
sub.f32 f486, f1712, f485;
mul.f32 f487, f1719, 0f3F6C835E;
fma.rn.f32 f488, f455, 0fBEC3EF15, f487;
mul.f32 f490, f1718, 0fBF0E39DA;
mul.f32 f1711, f459, 0f3F54DB31;
sub.f32 f491, f1711, f490;
mul.f32 f492, f1718, 0f3F54DB31;
fma.rn.f32 f493, f459, 0fBF0E39DA, f492;
mul.f32 f495, f1717, 0fBF3504F3;
mul.f32 f1710, f463, 0f3F3504F3;
sub.f32 f496, f1710, f495;
mul.f32 f497, f1717, 0f3F3504F3;
fma.rn.f32 f498, f463, 0fBF3504F3, f497;
mul.f32 f1708, f467, 0f3F0E39DA;
mul.f32 f1709, f1716, 0fBF54DB31;
sub.f32 f501, f1708, f1709;
mul.f32 f502, f1716, 0f3F0E39DA;
fma.rn.f32 f503, f467, 0fBF54DB31, f502;
mul.f32 f1706, f471, 0f3EC3EF15;
mul.f32 f1707, f1715, 0fBF6C835E;
sub.f32 f506, f1706, f1707;
mul.f32 f507, f1715, 0f3EC3EF15;
fma.rn.f32 f508, f471, 0fBF6C835E, f507;
mul.f32 f1704, f475, 0f3E47C5C2;
mul.f32 f1705, f1714, 0fBF7B14BE;
sub.f32 f511, f1704, f1705;
mul.f32 f512, f1714, 0f3E47C5C2;
fma.rn.f32 f513, f475, 0fBF7B14BE, f512;
mul.f32 f1702, f453, 0fBE47C5C2;
mul.f32 f1703, f454, 0fBF7B14BE;
sub.f32 f516, f1702, f1703;
mul.f32 f517, f454, 0fBE47C5C2;
fma.rn.f32 f518, f453, 0fBF7B14BE, f517;
mul.f32 f520, f458, 0fBF6C835E;
mul.f32 f1701, f457, 0fBEC3EF15;
sub.f32 f521, f1701, f520;
mul.f32 f522, f458, 0fBEC3EF15;
fma.rn.f32 f523, f457, 0fBF6C835E, f522;
mul.f32 f525, f462, 0fBF54DB31;
mul.f32 f1700, f461, 0fBF0E39DA;
sub.f32 f526, f1700, f525;
mul.f32 f527, f462, 0fBF0E39DA;
fma.rn.f32 f528, f461, 0fBF54DB31, f527;
mul.f32 f529, f465, 0fBF3504F3;
mul.f32 f530, f466, 0fBF3504F3;
sub.f32 f531, f529, f530;
add.f32 f532, f529, f530;
mul.f32 f1698, f469, 0fBF54DB31;
mul.f32 f1699, f470, 0fBF0E39DA;
sub.f32 f535, f1698, f1699;
mul.f32 f536, f470, 0fBF54DB31;
fma.rn.f32 f537, f469, 0fBF0E39DA, f536;
mul.f32 f539, f474, 0fBEC3EF15;
mul.f32 f1697, f473, 0fBF6C835E;
sub.f32 f540, f1697, f539;
mul.f32 f541, f474, 0fBF6C835E;
fma.rn.f32 f542, f473, 0fBEC3EF15, f541;
mul.f32 f544, f478, 0fBE47C5C2;
mul.f32 f1696, f477, 0fBF7B14BE;
sub.f32 f545, f1696, f544;
mul.f32 f546, f478, 0fBF7B14BE;
fma.rn.f32 f547, f477, 0fBE47C5C2, f546;
add.f32 f548, f272, f447;
sub.f32 f550, f272, f447;
add.f32 f1695, f1774, f1721;
sub.f32 f551, f1774, f1721;
add.f32 f552, f276, f481;
sub.f32 f554, f276, f481;
add.f32 f1694, f1773, f483;
sub.f32 f555, f1773, f483;
add.f32 f556, f280, f486;
sub.f32 f558, f280, f486;
add.f32 f1693, f1772, f488;
sub.f32 f559, f1772, f488;
add.f32 f560, f284, f491;
sub.f32 f562, f284, f491;
add.f32 f1692, f1771, f493;
sub.f32 f563, f1771, f493;
add.f32 f564, f288, f496;
sub.f32 f566, f288, f496;
add.f32 f1691, f1770, f498;
sub.f32 f567, f1770, f498;
add.f32 f568, f292, f501;
sub.f32 f570, f292, f501;
add.f32 f1690, f1769, f503;
sub.f32 f571, f1769, f503;
add.f32 f572, f296, f506;
sub.f32 f574, f296, f506;
add.f32 f1689, f1768, f508;
sub.f32 f575, f1768, f508;
add.f32 f576, f300, f511;
sub.f32 f578, f300, f511;
add.f32 f1688, f1767, f513;
sub.f32 f579, f1767, f513;
add.f32 f580, f274, f450;
sub.f32 f582, f274, f450;
sub.f32 f1687, f275, f449;
add.f32 f583, f275, f449;
add.f32 f584, f278, f516;
sub.f32 f586, f278, f516;
add.f32 f1686, f279, f518;
sub.f32 f587, f279, f518;
add.f32 f588, f282, f521;
sub.f32 f590, f282, f521;
add.f32 f1685, f283, f523;
sub.f32 f591, f283, f523;
add.f32 f592, f286, f526;
sub.f32 f594, f286, f526;
add.f32 f1684, f287, f528;
sub.f32 f595, f287, f528;
add.f32 f596, f290, f531;
sub.f32 f598, f290, f531;
add.f32 f1683, f291, f532;
sub.f32 f599, f291, f532;
add.f32 f600, f294, f535;
sub.f32 f602, f294, f535;
add.f32 f1682, f295, f537;
sub.f32 f603, f295, f537;
add.f32 f604, f298, f540;
sub.f32 f606, f298, f540;
add.f32 f1681, f299, f542;
sub.f32 f607, f299, f542;
add.f32 f608, f302, f545;
sub.f32 f610, f302, f545;
add.f32 f1680, f303, f547;
sub.f32 f611, f303, f547;
mov.u32 r15, %tid.x;
shl.b32 r7, r15, 3;
cvt.u64.u32 rd2, r7;
and.b64 rd3, rd2, 248;
mov.u64 rd4, %65;
add.s64 rd5, rd4, rd3;
ld.global.v2.f32 {f612, f613}, [rd5];
mul.f32 f617, f613, f1694;
mul.f32 f1679, f612, f552;
sub.f32 f618, f1679, f617;
mul.f32 f619, f612, f1694;
fma.rn.f32 f620, f613, f552, f619;
mul.f32 f1677, f612, f612;
mul.f32 f1678, f613, f613;
sub.f32 f623, f1677, f1678;
mul.f32 f624, f613, f612;
fma.rn.f32 f625, f613, f612, f624;
mul.f32 f1675, f623, f556;
mul.f32 f1676, f625, f1693;
sub.f32 f628, f1675, f1676;
mul.f32 f629, f623, f1693;
fma.rn.f32 f630, f625, f556, f629;
mul.f32 f1673, f612, f623;
mul.f32 f1674, f613, f625;
sub.f32 f633, f1673, f1674;
mul.f32 f634, f612, f625;
fma.rn.f32 f635, f613, f623, f634;
mul.f32 f1671, f633, f560;
mul.f32 f1672, f635, f1692;
sub.f32 f638, f1671, f1672;
mul.f32 f639, f633, f1692;
fma.rn.f32 f640, f635, f560, f639;
mul.f32 f642, f613, f635;
mul.f32 f1670, f612, f633;
sub.f32 f643, f1670, f642;
mul.f32 f644, f612, f635;
fma.rn.f32 f645, f613, f633, f644;
mul.f32 f647, f645, f1691;
mul.f32 f1669, f643, f564;
sub.f32 f648, f1669, f647;
mul.f32 f649, f643, f1691;
fma.rn.f32 f650, f645, f564, f649;
mul.f32 f652, f613, f645;
mul.f32 f1668, f612, f643;
sub.f32 f653, f1668, f652;
mul.f32 f654, f612, f645;
fma.rn.f32 f655, f613, f643, f654;
mul.f32 f657, f655, f1690;
mul.f32 f1667, f653, f568;
sub.f32 f658, f1667, f657;
mul.f32 f659, f653, f1690;
fma.rn.f32 f660, f655, f568, f659;
mul.f32 f662, f613, f655;
mul.f32 f1666, f612, f653;
sub.f32 f663, f1666, f662;
mul.f32 f664, f612, f655;
fma.rn.f32 f665, f613, f653, f664;
mul.f32 f1664, f663, f572;
mul.f32 f1665, f665, f1689;
sub.f32 f668, f1664, f1665;
mul.f32 f669, f663, f1689;
fma.rn.f32 f670, f665, f572, f669;
mul.f32 f1662, f612, f663;
mul.f32 f1663, f613, f665;
sub.f32 f673, f1662, f1663;
mul.f32 f674, f612, f665;
fma.rn.f32 f675, f613, f663, f674;
mul.f32 f1660, f673, f576;
mul.f32 f1661, f675, f1688;
sub.f32 f678, f1660, f1661;
mul.f32 f679, f673, f1688;
fma.rn.f32 f680, f675, f576, f679;
mul.f32 f1658, f612, f673;
mul.f32 f1659, f613, f675;
sub.f32 f683, f1658, f1659;
mul.f32 f684, f612, f675;
fma.rn.f32 f685, f613, f673, f684;
mul.f32 f687, f685, f1687;
mul.f32 f1657, f683, f580;
sub.f32 f688, f1657, f687;
mul.f32 f689, f683, f1687;
fma.rn.f32 f690, f685, f580, f689;
mul.f32 f692, f613, f685;
mul.f32 f1656, f612, f683;
sub.f32 f693, f1656, f692;
mul.f32 f694, f612, f685;
fma.rn.f32 f695, f613, f683, f694;
mul.f32 f697, f695, f1686;
mul.f32 f1655, f693, f584;
sub.f32 f698, f1655, f697;
mul.f32 f699, f693, f1686;
fma.rn.f32 f700, f695, f584, f699;
mul.f32 f702, f613, f695;
mul.f32 f1654, f612, f693;
sub.f32 f703, f1654, f702;
mul.f32 f704, f612, f695;
fma.rn.f32 f705, f613, f693, f704;
mul.f32 f707, f705, f1685;
mul.f32 f1653, f703, f588;
sub.f32 f708, f1653, f707;
mul.f32 f709, f703, f1685;
fma.rn.f32 f710, f705, f588, f709;
mul.f32 f1651, f612, f703;
mul.f32 f1652, f613, f705;
sub.f32 f713, f1651, f1652;
mul.f32 f714, f612, f705;
fma.rn.f32 f715, f613, f703, f714;
mul.f32 f1649, f713, f592;
mul.f32 f1650, f715, f1684;
sub.f32 f718, f1649, f1650;
mul.f32 f719, f713, f1684;
fma.rn.f32 f720, f715, f592, f719;
mul.f32 f1647, f612, f713;
mul.f32 f1648, f613, f715;
sub.f32 f723, f1647, f1648;
mul.f32 f724, f612, f715;
fma.rn.f32 f725, f613, f713, f724;
mul.f32 f1645, f723, f596;
mul.f32 f1646, f725, f1683;
sub.f32 f728, f1645, f1646;
mul.f32 f729, f723, f1683;
fma.rn.f32 f730, f725, f596, f729;
mul.f32 f732, f613, f725;
mul.f32 f1644, f612, f723;
sub.f32 f733, f1644, f732;
mul.f32 f734, f612, f725;
fma.rn.f32 f735, f613, f723, f734;
mul.f32 f737, f735, f1682;
mul.f32 f1643, f733, f600;
sub.f32 f738, f1643, f737;
mul.f32 f739, f733, f1682;
fma.rn.f32 f740, f735, f600, f739;
mul.f32 f742, f613, f735;
mul.f32 f1642, f612, f733;
sub.f32 f743, f1642, f742;
mul.f32 f744, f612, f735;
fma.rn.f32 f745, f613, f733, f744;
mul.f32 f747, f745, f1681;
mul.f32 f1641, f743, f604;
sub.f32 f748, f1641, f747;
mul.f32 f749, f743, f1681;
fma.rn.f32 f750, f745, f604, f749;
mul.f32 f752, f613, f745;
mul.f32 f1640, f612, f743;
sub.f32 f753, f1640, f752;
mul.f32 f754, f612, f745;
fma.rn.f32 f755, f613, f743, f754;
mul.f32 f757, f755, f1680;
mul.f32 f1639, f753, f608;
sub.f32 f758, f1639, f757;
mul.f32 f759, f753, f1680;
fma.rn.f32 f760, f755, f608, f759;
mul.f32 f1637, f612, f753;
mul.f32 f1638, f613, f755;
sub.f32 f763, f1637, f1638;
mul.f32 f764, f612, f755;
fma.rn.f32 f765, f613, f753, f764;
mul.f32 f1635, f763, f550;
mul.f32 f1636, f765, f551;
sub.f32 f768, f1635, f1636;
mul.f32 f769, f763, f551;
fma.rn.f32 f770, f765, f550, f769;
mul.f32 f1633, f612, f763;
mul.f32 f1634, f613, f765;
sub.f32 f773, f1633, f1634;
mul.f32 f774, f612, f765;
fma.rn.f32 f775, f613, f763, f774;
mul.f32 f777, f775, f555;
mul.f32 f1632, f773, f554;
sub.f32 f778, f1632, f777;
mul.f32 f779, f773, f555;
fma.rn.f32 f780, f775, f554, f779;
mul.f32 f782, f613, f775;
mul.f32 f1631, f612, f773;
sub.f32 f783, f1631, f782;
mul.f32 f784, f612, f775;
fma.rn.f32 f785, f613, f773, f784;
mul.f32 f787, f785, f559;
mul.f32 f1630, f783, f558;
sub.f32 f788, f1630, f787;
mul.f32 f789, f783, f559;
fma.rn.f32 f790, f785, f558, f789;
mul.f32 f792, f613, f785;
mul.f32 f1629, f612, f783;
sub.f32 f793, f1629, f792;
mul.f32 f794, f612, f785;
fma.rn.f32 f795, f613, f783, f794;
mul.f32 f797, f795, f563;
mul.f32 f1628, f793, f562;
sub.f32 f798, f1628, f797;
mul.f32 f799, f793, f563;
fma.rn.f32 f800, f795, f562, f799;
mul.f32 f802, f613, f795;
mul.f32 f1627, f612, f793;
sub.f32 f803, f1627, f802;
mul.f32 f804, f612, f795;
fma.rn.f32 f805, f613, f793, f804;
mul.f32 f1625, f803, f566;
mul.f32 f1626, f805, f567;
sub.f32 f808, f1625, f1626;
mul.f32 f809, f803, f567;
fma.rn.f32 f810, f805, f566, f809;
mul.f32 f1623, f612, f803;
mul.f32 f1624, f613, f805;
sub.f32 f813, f1623, f1624;
mul.f32 f814, f612, f805;
fma.rn.f32 f815, f613, f803, f814;
mul.f32 f1621, f813, f570;
mul.f32 f1622, f815, f571;
sub.f32 f818, f1621, f1622;
mul.f32 f819, f813, f571;
fma.rn.f32 f820, f815, f570, f819;
mul.f32 f1619, f612, f813;
mul.f32 f1620, f613, f815;
sub.f32 f823, f1619, f1620;
mul.f32 f824, f612, f815;
fma.rn.f32 f825, f613, f813, f824;
mul.f32 f827, f825, f575;
mul.f32 f1618, f823, f574;
sub.f32 f828, f1618, f827;
mul.f32 f829, f823, f575;
fma.rn.f32 f830, f825, f574, f829;
mul.f32 f832, f613, f825;
mul.f32 f1617, f612, f823;
sub.f32 f833, f1617, f832;
mul.f32 f834, f612, f825;
fma.rn.f32 f835, f613, f823, f834;
mul.f32 f837, f835, f579;
mul.f32 f1616, f833, f578;
sub.f32 f838, f1616, f837;
mul.f32 f839, f833, f579;
fma.rn.f32 f840, f835, f578, f839;
mul.f32 f842, f613, f835;
mul.f32 f1615, f612, f833;
sub.f32 f843, f1615, f842;
mul.f32 f844, f612, f835;
fma.rn.f32 f845, f613, f833, f844;
mul.f32 f847, f845, f583;
mul.f32 f1614, f843, f582;
sub.f32 f848, f1614, f847;
mul.f32 f849, f843, f583;
fma.rn.f32 f850, f845, f582, f849;
mul.f32 f1612, f612, f843;
mul.f32 f1613, f613, f845;
sub.f32 f853, f1612, f1613;
mul.f32 f854, f612, f845;
fma.rn.f32 f855, f613, f843, f854;
mul.f32 f1610, f853, f586;
mul.f32 f1611, f855, f587;
sub.f32 f858, f1610, f1611;
mul.f32 f859, f853, f587;
fma.rn.f32 f860, f855, f586, f859;
mul.f32 f1608, f612, f853;
mul.f32 f1609, f613, f855;
sub.f32 f863, f1608, f1609;
mul.f32 f864, f612, f855;
fma.rn.f32 f865, f613, f853, f864;
mul.f32 f1606, f863, f590;
mul.f32 f1607, f865, f591;
sub.f32 f868, f1606, f1607;
mul.f32 f869, f863, f591;
fma.rn.f32 f870, f865, f590, f869;
mul.f32 f872, f613, f865;
mul.f32 f1605, f612, f863;
sub.f32 f873, f1605, f872;
mul.f32 f874, f612, f865;
fma.rn.f32 f875, f613, f863, f874;
mul.f32 f877, f875, f595;
mul.f32 f1604, f873, f594;
sub.f32 f878, f1604, f877;
mul.f32 f879, f873, f595;
fma.rn.f32 f880, f875, f594, f879;
mul.f32 f882, f613, f875;
mul.f32 f1603, f612, f873;
sub.f32 f883, f1603, f882;
mul.f32 f884, f612, f875;
fma.rn.f32 f885, f613, f873, f884;
mul.f32 f887, f885, f599;
mul.f32 f1602, f883, f598;
sub.f32 f888, f1602, f887;
mul.f32 f889, f883, f599;
fma.rn.f32 f890, f885, f598, f889;
mul.f32 f892, f613, f885;
mul.f32 f1601, f612, f883;
sub.f32 f893, f1601, f892;
mul.f32 f894, f612, f885;
fma.rn.f32 f895, f613, f883, f894;
mul.f32 f1599, f893, f602;
mul.f32 f1600, f895, f603;
sub.f32 f898, f1599, f1600;
mul.f32 f899, f893, f603;
fma.rn.f32 f900, f895, f602, f899;
mul.f32 f1597, f612, f893;
mul.f32 f1598, f613, f895;
sub.f32 f903, f1597, f1598;
mul.f32 f904, f612, f895;
fma.rn.f32 f905, f613, f893, f904;
mul.f32 f1595, f903, f606;
mul.f32 f1596, f905, f607;
sub.f32 f908, f1595, f1596;
mul.f32 f909, f903, f607;
fma.rn.f32 f910, f905, f606, f909;
mul.f32 f1593, f612, f903;
mul.f32 f1594, f613, f905;
sub.f32 f913, f1593, f1594;
mul.f32 f914, f612, f905;
fma.rn.f32 f915, f613, f903, f914;
mul.f32 f917, f915, f611;
mul.f32 f1592, f913, f610;
sub.f32 f918, f1592, f917;
mov.u32 r17, %tid.x;
mul.f32 f919, f913, f611;
fma.rn.f32 f920, f915, f610, f919;
and.b32 r14, r17, 31;
shl.b32 r8, r17, 7;
and.b32 r9, r8, -4096;
add.s32 r10, r4, r9;
barrier.sync 0;
and.b32 r11, r8, 3968;
add.s32 r12, r10, r11;
st.shared.v4.f32 [r12], {f548, f618, f628, f638};
st.shared.v4.f32 [r12+16], {f648, f658, f668, f678};
st.shared.v4.f32 [r12+32], {f688, f698, f708, f718};
st.shared.v4.f32 [r12+48], {f728, f738, f748, f758};
st.shared.v4.f32 [r12+64], {f768, f778, f788, f798};
st.shared.v4.f32 [r12+80], {f808, f818, f828, f838};
st.shared.v4.f32 [r12+96], {f848, f858, f868, f878};
st.shared.v4.f32 [r12+112], {f888, f898, f908, f918};
barrier.sync 0;
mad.lo.s32 r13, r14, -124, r12;
ld.shared.f32 f921, [r13];
ld.shared.f32 f922, [r13+128];
ld.shared.f32 f923, [r13+256];
ld.shared.f32 f924, [r13+384];
ld.shared.f32 f925, [r13+512];
ld.shared.f32 f926, [r13+640];
ld.shared.f32 f927, [r13+768];
ld.shared.f32 f928, [r13+896];
ld.shared.f32 f929, [r13+1024];
ld.shared.f32 f930, [r13+1152];
ld.shared.f32 f931, [r13+1280];
ld.shared.f32 f932, [r13+1408];
ld.shared.f32 f933, [r13+1536];
ld.shared.f32 f934, [r13+1664];
ld.shared.f32 f935, [r13+1792];
ld.shared.f32 f936, [r13+1920];
ld.shared.f32 f937, [r13+2048];
ld.shared.f32 f938, [r13+2176];
ld.shared.f32 f939, [r13+2304];
ld.shared.f32 f940, [r13+2432];
ld.shared.f32 f941, [r13+2560];
ld.shared.f32 f942, [r13+2688];
ld.shared.f32 f943, [r13+2816];
ld.shared.f32 f944, [r13+2944];
ld.shared.f32 f945, [r13+3072];
ld.shared.f32 f946, [r13+3200];
ld.shared.f32 f947, [r13+3328];
ld.shared.f32 f948, [r13+3456];
ld.shared.f32 f949, [r13+3584];
ld.shared.f32 f950, [r13+3712];
ld.shared.f32 f951, [r13+3840];
ld.shared.f32 f952, [r13+3968];
barrier.sync 0;
st.shared.v4.f32 [r12], {f1695, f620, f630, f640};
st.shared.v4.f32 [r12+16], {f650, f660, f670, f680};
st.shared.v4.f32 [r12+32], {f690, f700, f710, f720};
st.shared.v4.f32 [r12+48], {f730, f740, f750, f760};
st.shared.v4.f32 [r12+64], {f770, f780, f790, f800};
st.shared.v4.f32 [r12+80], {f810, f820, f830, f840};
st.shared.v4.f32 [r12+96], {f850, f860, f870, f880};
st.shared.v4.f32 [r12+112], {f890, f900, f910, f920};
barrier.sync 0;
ld.shared.f32 f953, [r13];
ld.shared.f32 f954, [r13+128];
ld.shared.f32 f955, [r13+256];
ld.shared.f32 f956, [r13+384];
ld.shared.f32 f957, [r13+512];
ld.shared.f32 f958, [r13+640];
ld.shared.f32 f959, [r13+768];
ld.shared.f32 f960, [r13+896];
ld.shared.f32 f961, [r13+1024];
ld.shared.f32 f962, [r13+1152];
ld.shared.f32 f963, [r13+1280];
ld.shared.f32 f964, [r13+1408];
ld.shared.f32 f965, [r13+1536];
ld.shared.f32 f966, [r13+1664];
ld.shared.f32 f967, [r13+1792];
ld.shared.f32 f968, [r13+1920];
ld.shared.f32 f969, [r13+2048];
ld.shared.f32 f970, [r13+2176];
ld.shared.f32 f971, [r13+2304];
ld.shared.f32 f972, [r13+2432];
ld.shared.f32 f973, [r13+2560];
ld.shared.f32 f974, [r13+2688];
ld.shared.f32 f975, [r13+2816];
ld.shared.f32 f976, [r13+2944];
ld.shared.f32 f977, [r13+3072];
ld.shared.f32 f978, [r13+3200];
ld.shared.f32 f979, [r13+3328];
ld.shared.f32 f980, [r13+3456];
ld.shared.f32 f981, [r13+3584];
ld.shared.f32 f982, [r13+3712];
ld.shared.f32 f983, [r13+3840];
ld.shared.f32 f984, [r13+3968];
add.f32 f985, f921, f937;
sub.f32 f987, f921, f937;
add.f32 f1591, f953, f969;
sub.f32 f988, f953, f969;
add.f32 f989, f929, f945;
sub.f32 f991, f929, f945;
add.f32 f1590, f961, f977;
sub.f32 f992, f961, f977;
add.f32 f993, f985, f989;
sub.f32 f995, f985, f989;
add.f32 f1589, f1591, f1590;
sub.f32 f996, f1591, f1590;
add.f32 f997, f987, f992;
sub.f32 f999, f987, f992;
sub.f32 f1588, f988, f991;
add.f32 f1000, f988, f991;
add.f32 f1001, f925, f941;
sub.f32 f1003, f925, f941;
add.f32 f1587, f957, f973;
sub.f32 f1004, f957, f973;
add.f32 f1005, f933, f949;
sub.f32 f1007, f933, f949;
add.f32 f1586, f965, f981;
sub.f32 f1008, f965, f981;
add.f32 f1009, f1001, f1005;
sub.f32 f1011, f1001, f1005;
add.f32 f1585, f1587, f1586;
sub.f32 f1012, f1587, f1586;
add.f32 f1013, f1003, f1008;
sub.f32 f1015, f1003, f1008;
sub.f32 f1584, f1004, f1007;
add.f32 f1016, f1004, f1007;
mul.f32 f1018, f1584, 0fBF3504F3;
mul.f32 f1583, f1013, 0f3F3504F3;
sub.f32 f1019, f1583, f1018;
mul.f32 f1020, f1584, 0f3F3504F3;
fma.rn.f32 f1021, f1013, 0fBF3504F3, f1020;
mul.f32 f1022, f1015, 0fBF3504F3;
mul.f32 f1023, f1016, 0fBF3504F3;
sub.f32 f1024, f1022, f1023;
add.f32 f1025, f1022, f1023;
add.f32 f1026, f993, f1009;
sub.f32 f1028, f993, f1009;
add.f32 f1582, f1589, f1585;
sub.f32 f1029, f1589, f1585;
add.f32 f1030, f997, f1019;
sub.f32 f1032, f997, f1019;
add.f32 f1581, f1588, f1021;
sub.f32 f1033, f1588, f1021;
add.f32 f1034, f995, f1012;
sub.f32 f1036, f995, f1012;
sub.f32 f1580, f996, f1011;
add.f32 f1037, f996, f1011;
add.f32 f1038, f999, f1024;
sub.f32 f1040, f999, f1024;
add.f32 f1579, f1000, f1025;
sub.f32 f1041, f1000, f1025;
add.f32 f1042, f923, f939;
sub.f32 f1044, f923, f939;
add.f32 f1578, f955, f971;
sub.f32 f1045, f955, f971;
add.f32 f1046, f931, f947;
sub.f32 f1048, f931, f947;
add.f32 f1577, f963, f979;
sub.f32 f1049, f963, f979;
add.f32 f1050, f1042, f1046;
sub.f32 f1052, f1042, f1046;
add.f32 f1576, f1578, f1577;
sub.f32 f1053, f1578, f1577;
add.f32 f1054, f1044, f1049;
sub.f32 f1056, f1044, f1049;
sub.f32 f1575, f1045, f1048;
add.f32 f1057, f1045, f1048;
add.f32 f1058, f927, f943;
sub.f32 f1060, f927, f943;
add.f32 f1574, f959, f975;
sub.f32 f1061, f959, f975;
add.f32 f1062, f935, f951;
sub.f32 f1064, f935, f951;
add.f32 f1573, f967, f983;
sub.f32 f1065, f967, f983;
add.f32 f1066, f1058, f1062;
sub.f32 f1068, f1058, f1062;
add.f32 f1572, f1574, f1573;
sub.f32 f1069, f1574, f1573;
add.f32 f1070, f1060, f1065;
sub.f32 f1072, f1060, f1065;
sub.f32 f1571, f1061, f1064;
add.f32 f1073, f1061, f1064;
mul.f32 f1075, f1571, 0fBF3504F3;
mul.f32 f1570, f1070, 0f3F3504F3;
sub.f32 f1076, f1570, f1075;
mul.f32 f1077, f1571, 0f3F3504F3;
fma.rn.f32 f1078, f1070, 0fBF3504F3, f1077;
mul.f32 f1079, f1072, 0fBF3504F3;
mul.f32 f1080, f1073, 0fBF3504F3;
sub.f32 f1081, f1079, f1080;
add.f32 f1082, f1079, f1080;
add.f32 f1083, f1050, f1066;
sub.f32 f1085, f1050, f1066;
add.f32 f1569, f1576, f1572;
sub.f32 f1086, f1576, f1572;
add.f32 f1087, f1054, f1076;
sub.f32 f1089, f1054, f1076;
add.f32 f1568, f1575, f1078;
sub.f32 f1090, f1575, f1078;
add.f32 f1091, f1052, f1069;
sub.f32 f1093, f1052, f1069;
sub.f32 f1567, f1053, f1068;
add.f32 f1094, f1053, f1068;
add.f32 f1095, f1056, f1081;
sub.f32 f1097, f1056, f1081;
add.f32 f1566, f1057, f1082;
sub.f32 f1098, f1057, f1082;
mul.f32 f1564, f1087, 0f3F6C835E;
mul.f32 f1565, f1568, 0fBEC3EF15;
sub.f32 f1101, f1564, f1565;
mul.f32 f1102, f1568, 0f3F6C835E;
fma.rn.f32 f1103, f1087, 0fBEC3EF15, f1102;
mul.f32 f1562, f1091, 0f3F3504F3;
mul.f32 f1563, f1567, 0fBF3504F3;
sub.f32 f1106, f1562, f1563;
mul.f32 f1107, f1567, 0f3F3504F3;
fma.rn.f32 f1108, f1091, 0fBF3504F3, f1107;
mul.f32 f1110, f1566, 0fBF6C835E;
mul.f32 f1561, f1095, 0f3EC3EF15;
sub.f32 f1111, f1561, f1110;
mul.f32 f1112, f1566, 0f3EC3EF15;
fma.rn.f32 f1113, f1095, 0fBF6C835E, f1112;
mul.f32 f1115, f1090, 0fBF6C835E;
mul.f32 f1560, f1089, 0fBEC3EF15;
sub.f32 f1116, f1560, f1115;
mul.f32 f1117, f1090, 0fBEC3EF15;
fma.rn.f32 f1118, f1089, 0fBF6C835E, f1117;
mul.f32 f1119, f1093, 0fBF3504F3;
mul.f32 f1120, f1094, 0fBF3504F3;
sub.f32 f1121, f1119, f1120;
add.f32 f1122, f1119, f1120;
mul.f32 f1558, f1097, 0fBF6C835E;
mul.f32 f1559, f1098, 0fBEC3EF15;
sub.f32 f1125, f1558, f1559;
mul.f32 f1126, f1098, 0fBF6C835E;
fma.rn.f32 f1127, f1097, 0fBEC3EF15, f1126;
add.f32 f1128, f1026, f1083;
sub.f32 f1130, f1026, f1083;
add.f32 f1557, f1582, f1569;
sub.f32 f1131, f1582, f1569;
add.f32 f1132, f1030, f1101;
sub.f32 f1134, f1030, f1101;
add.f32 f1556, f1581, f1103;
sub.f32 f1135, f1581, f1103;
add.f32 f1136, f1034, f1106;
sub.f32 f1138, f1034, f1106;
add.f32 f1555, f1580, f1108;
sub.f32 f1139, f1580, f1108;
add.f32 f1140, f1038, f1111;
sub.f32 f1142, f1038, f1111;
add.f32 f1554, f1579, f1113;
sub.f32 f1143, f1579, f1113;
add.f32 f1144, f1028, f1086;
sub.f32 f1146, f1028, f1086;
sub.f32 f1553, f1029, f1085;
add.f32 f1147, f1029, f1085;
add.f32 f1148, f1032, f1116;
sub.f32 f1150, f1032, f1116;
add.f32 f1552, f1033, f1118;
sub.f32 f1151, f1033, f1118;
add.f32 f1152, f1036, f1121;
sub.f32 f1154, f1036, f1121;
add.f32 f1551, f1037, f1122;
sub.f32 f1155, f1037, f1122;
add.f32 f1156, f1040, f1125;
sub.f32 f1158, f1040, f1125;
add.f32 f1550, f1041, f1127;
sub.f32 f1159, f1041, f1127;
add.f32 f1160, f922, f938;
sub.f32 f1162, f922, f938;
add.f32 f1549, f954, f970;
sub.f32 f1163, f954, f970;
add.f32 f1164, f930, f946;
sub.f32 f1166, f930, f946;
add.f32 f1548, f962, f978;
sub.f32 f1167, f962, f978;
add.f32 f1168, f1160, f1164;
sub.f32 f1170, f1160, f1164;
add.f32 f1547, f1549, f1548;
sub.f32 f1171, f1549, f1548;
add.f32 f1172, f1162, f1167;
sub.f32 f1174, f1162, f1167;
sub.f32 f1546, f1163, f1166;
add.f32 f1175, f1163, f1166;
add.f32 f1176, f926, f942;
sub.f32 f1178, f926, f942;
add.f32 f1545, f958, f974;
sub.f32 f1179, f958, f974;
add.f32 f1180, f934, f950;
sub.f32 f1182, f934, f950;
add.f32 f1544, f966, f982;
sub.f32 f1183, f966, f982;
add.f32 f1184, f1176, f1180;
sub.f32 f1186, f1176, f1180;
add.f32 f1543, f1545, f1544;
sub.f32 f1187, f1545, f1544;
add.f32 f1188, f1178, f1183;
sub.f32 f1190, f1178, f1183;
sub.f32 f1542, f1179, f1182;
add.f32 f1191, f1179, f1182;
mul.f32 f1193, f1542, 0fBF3504F3;
mul.f32 f1541, f1188, 0f3F3504F3;
sub.f32 f1194, f1541, f1193;
mul.f32 f1195, f1542, 0f3F3504F3;
fma.rn.f32 f1196, f1188, 0fBF3504F3, f1195;
mul.f32 f1197, f1190, 0fBF3504F3;
mul.f32 f1198, f1191, 0fBF3504F3;
sub.f32 f1199, f1197, f1198;
add.f32 f1200, f1197, f1198;
add.f32 f1201, f1168, f1184;
sub.f32 f1203, f1168, f1184;
add.f32 f1540, f1547, f1543;
sub.f32 f1204, f1547, f1543;
add.f32 f1205, f1172, f1194;
sub.f32 f1207, f1172, f1194;
add.f32 f1539, f1546, f1196;
sub.f32 f1208, f1546, f1196;
add.f32 f1209, f1170, f1187;
sub.f32 f1211, f1170, f1187;
sub.f32 f1538, f1171, f1186;
add.f32 f1212, f1171, f1186;
add.f32 f1213, f1174, f1199;
sub.f32 f1215, f1174, f1199;
add.f32 f1537, f1175, f1200;
sub.f32 f1216, f1175, f1200;
add.f32 f1217, f924, f940;
sub.f32 f1219, f924, f940;
add.f32 f1536, f956, f972;
sub.f32 f1220, f956, f972;
add.f32 f1221, f932, f948;
sub.f32 f1223, f932, f948;
add.f32 f1535, f964, f980;
sub.f32 f1224, f964, f980;
add.f32 f1225, f1217, f1221;
sub.f32 f1227, f1217, f1221;
add.f32 f1534, f1536, f1535;
sub.f32 f1228, f1536, f1535;
add.f32 f1229, f1219, f1224;
sub.f32 f1231, f1219, f1224;
sub.f32 f1533, f1220, f1223;
add.f32 f1232, f1220, f1223;
add.f32 f1233, f928, f944;
sub.f32 f1235, f928, f944;
add.f32 f1532, f960, f976;
sub.f32 f1236, f960, f976;
add.f32 f1237, f936, f952;
sub.f32 f1239, f936, f952;
add.f32 f1531, f968, f984;
sub.f32 f1240, f968, f984;
add.f32 f1241, f1233, f1237;
sub.f32 f1243, f1233, f1237;
add.f32 f1530, f1532, f1531;
sub.f32 f1244, f1532, f1531;
add.f32 f1245, f1235, f1240;
sub.f32 f1247, f1235, f1240;
sub.f32 f1529, f1236, f1239;
add.f32 f1248, f1236, f1239;
mul.f32 f1250, f1529, 0fBF3504F3;
mul.f32 f1528, f1245, 0f3F3504F3;
sub.f32 f1251, f1528, f1250;
mul.f32 f1252, f1529, 0f3F3504F3;
fma.rn.f32 f1253, f1245, 0fBF3504F3, f1252;
mul.f32 f1254, f1247, 0fBF3504F3;
mul.f32 f1255, f1248, 0fBF3504F3;
sub.f32 f1256, f1254, f1255;
add.f32 f1257, f1254, f1255;
add.f32 f1258, f1225, f1241;
sub.f32 f1260, f1225, f1241;
add.f32 f1527, f1534, f1530;
sub.f32 f1261, f1534, f1530;
add.f32 f1262, f1229, f1251;
sub.f32 f1264, f1229, f1251;
add.f32 f1526, f1533, f1253;
sub.f32 f1265, f1533, f1253;
add.f32 f1266, f1227, f1244;
sub.f32 f1268, f1227, f1244;
sub.f32 f1525, f1228, f1243;
add.f32 f1269, f1228, f1243;
add.f32 f1270, f1231, f1256;
sub.f32 f1272, f1231, f1256;
add.f32 f1524, f1232, f1257;
sub.f32 f1273, f1232, f1257;
mul.f32 f1522, f1262, 0f3F6C835E;
mul.f32 f1523, f1526, 0fBEC3EF15;
sub.f32 f1276, f1522, f1523;
mul.f32 f1277, f1526, 0f3F6C835E;
fma.rn.f32 f1278, f1262, 0fBEC3EF15, f1277;
mul.f32 f1520, f1266, 0f3F3504F3;
mul.f32 f1521, f1525, 0fBF3504F3;
sub.f32 f1281, f1520, f1521;
mul.f32 f1282, f1525, 0f3F3504F3;
fma.rn.f32 f1283, f1266, 0fBF3504F3, f1282;
mul.f32 f1518, f1270, 0f3EC3EF15;
mul.f32 f1519, f1524, 0fBF6C835E;
sub.f32 f1286, f1518, f1519;
mul.f32 f1287, f1524, 0f3EC3EF15;
fma.rn.f32 f1288, f1270, 0fBF6C835E, f1287;
mul.f32 f1516, f1264, 0fBEC3EF15;
mul.f32 f1517, f1265, 0fBF6C835E;
sub.f32 f1291, f1516, f1517;
mul.f32 f1292, f1265, 0fBEC3EF15;
fma.rn.f32 f1293, f1264, 0fBF6C835E, f1292;
mul.f32 f1294, f1268, 0fBF3504F3;
mul.f32 f1295, f1269, 0fBF3504F3;
sub.f32 f1296, f1294, f1295;
add.f32 f1297, f1294, f1295;
mul.f32 f1514, f1272, 0fBF6C835E;
mul.f32 f1515, f1273, 0fBEC3EF15;
sub.f32 f1300, f1514, f1515;
mul.f32 f1301, f1273, 0fBF6C835E;
fma.rn.f32 f1302, f1272, 0fBEC3EF15, f1301;
add.f32 f1303, f1201, f1258;
sub.f32 f1305, f1201, f1258;
add.f32 f1513, f1540, f1527;
sub.f32 f1306, f1540, f1527;
add.f32 f1307, f1205, f1276;
sub.f32 f1309, f1205, f1276;
add.f32 f1512, f1539, f1278;
sub.f32 f1310, f1539, f1278;
add.f32 f1311, f1209, f1281;
sub.f32 f1313, f1209, f1281;
add.f32 f1511, f1538, f1283;
sub.f32 f1314, f1538, f1283;
add.f32 f1315, f1213, f1286;
sub.f32 f1317, f1213, f1286;
add.f32 f1510, f1537, f1288;
sub.f32 f1318, f1537, f1288;
add.f32 f1319, f1203, f1261;
sub.f32 f1321, f1203, f1261;
sub.f32 f1509, f1204, f1260;
add.f32 f1322, f1204, f1260;
add.f32 f1323, f1207, f1291;
sub.f32 f1325, f1207, f1291;
add.f32 f1508, f1208, f1293;
sub.f32 f1326, f1208, f1293;
add.f32 f1327, f1211, f1296;
sub.f32 f1329, f1211, f1296;
add.f32 f1507, f1212, f1297;
sub.f32 f1330, f1212, f1297;
add.f32 f1331, f1215, f1300;
sub.f32 f1333, f1215, f1300;
add.f32 f1506, f1216, f1302;
sub.f32 f1334, f1216, f1302;
mul.f32 f1336, f1512, 0fBE47C5C2;
mul.f32 f1505, f1307, 0f3F7B14BE;
sub.f32 f1337, f1505, f1336;
mul.f32 f1338, f1512, 0f3F7B14BE;
fma.rn.f32 f1339, f1307, 0fBE47C5C2, f1338;
mul.f32 f1341, f1511, 0fBEC3EF15;
mul.f32 f1504, f1311, 0f3F6C835E;
sub.f32 f1342, f1504, f1341;
mul.f32 f1343, f1511, 0f3F6C835E;
fma.rn.f32 f1344, f1311, 0fBEC3EF15, f1343;
mul.f32 f1502, f1315, 0f3F54DB31;
mul.f32 f1503, f1510, 0fBF0E39DA;
sub.f32 f1347, f1502, f1503;
mul.f32 f1348, f1510, 0f3F54DB31;
fma.rn.f32 f1349, f1315, 0fBF0E39DA, f1348;
mul.f32 f1500, f1319, 0f3F3504F3;
mul.f32 f1501, f1509, 0fBF3504F3;
sub.f32 f1352, f1500, f1501;
mul.f32 f1353, f1509, 0f3F3504F3;
fma.rn.f32 f1354, f1319, 0fBF3504F3, f1353;
mul.f32 f1498, f1323, 0f3F0E39DA;
mul.f32 f1499, f1508, 0fBF54DB31;
sub.f32 f1357, f1498, f1499;
mul.f32 f1358, f1508, 0f3F0E39DA;
fma.rn.f32 f1359, f1323, 0fBF54DB31, f1358;
mul.f32 f1496, f1327, 0f3EC3EF15;
mul.f32 f1497, f1507, 0fBF6C835E;
sub.f32 f1362, f1496, f1497;
mul.f32 f1363, f1507, 0f3EC3EF15;
fma.rn.f32 f1364, f1327, 0fBF6C835E, f1363;
mul.f32 f1366, f1506, 0fBF7B14BE;
mul.f32 f1495, f1331, 0f3E47C5C2;
sub.f32 f1367, f1495, f1366;
mul.f32 f1368, f1506, 0f3E47C5C2;
fma.rn.f32 f1369, f1331, 0fBF7B14BE, f1368;
mul.f32 f1371, f1310, 0fBF7B14BE;
mul.f32 f1494, f1309, 0fBE47C5C2;
sub.f32 f1372, f1494, f1371;
mul.f32 f1373, f1310, 0fBE47C5C2;
fma.rn.f32 f1374, f1309, 0fBF7B14BE, f1373;
mul.f32 f1376, f1314, 0fBF6C835E;
mul.f32 f1493, f1313, 0fBEC3EF15;
sub.f32 f1377, f1493, f1376;
mul.f32 f1378, f1314, 0fBEC3EF15;
fma.rn.f32 f1379, f1313, 0fBF6C835E, f1378;
mul.f32 f1381, f1318, 0fBF54DB31;
mul.f32 f1492, f1317, 0fBF0E39DA;
sub.f32 f1382, f1492, f1381;
mul.f32 f1383, f1318, 0fBF0E39DA;
fma.rn.f32 f1384, f1317, 0fBF54DB31, f1383;
mul.f32 f1385, f1321, 0fBF3504F3;
mul.f32 f1386, f1322, 0fBF3504F3;
sub.f32 f1387, f1385, f1386;
add.f32 f1388, f1385, f1386;
mul.f32 f1390, f1326, 0fBF0E39DA;
mul.f32 f1491, f1325, 0fBF54DB31;
sub.f32 f1391, f1491, f1390;
mul.f32 f1392, f1326, 0fBF54DB31;
fma.rn.f32 f1393, f1325, 0fBF0E39DA, f1392;
mul.f32 f1395, f1330, 0fBEC3EF15;
mul.f32 f1490, f1329, 0fBF6C835E;
sub.f32 f1396, f1490, f1395;
mul.f32 f1397, f1330, 0fBF6C835E;
fma.rn.f32 f1398, f1329, 0fBEC3EF15, f1397;
mul.f32 f1400, f1334, 0fBE47C5C2;
mul.f32 f1489, f1333, 0fBF7B14BE;
sub.f32 f1401, f1489, f1400;
mul.f32 f1402, f1334, 0fBF7B14BE;
fma.rn.f32 f1403, f1333, 0fBE47C5C2, f1402;
add.f32 %1, f1557, f1513;
add.f32 %0, f1128, f1303;
add.f32 %2, f1132, f1337;
add.f32 %3, f1556, f1339;
add.f32 %4, f1136, f1342;
add.f32 %5, f1555, f1344;
add.f32 %6, f1140, f1347;
add.f32 %7, f1554, f1349;
add.f32 %9, f1553, f1354;
add.f32 %8, f1144, f1352;
add.f32 %11, f1552, f1359;
add.f32 %10, f1148, f1357;
add.f32 %12, f1152, f1362;
add.f32 %13, f1551, f1364;
add.f32 %14, f1156, f1367;
add.f32 %15, f1550, f1369;
add.f32 %16, f1130, f1306;
sub.f32 %17, f1131, f1305;
add.f32 %18, f1134, f1372;
add.f32 %19, f1135, f1374;
add.f32 %21, f1139, f1379;
add.f32 %20, f1138, f1377;
add.f32 %23, f1143, f1384;
add.f32 %22, f1142, f1382;
add.f32 %25, f1147, f1388;
add.f32 %24, f1146, f1387;
add.f32 %26, f1150, f1391;
add.f32 %27, f1151, f1393;
add.f32 %28, f1154, f1396;
add.f32 %29, f1155, f1398;
add.f32 %30, f1158, f1401;
add.f32 %31, f1159, f1403;
sub.f32 %32, f1128, f1303;
sub.f32 %33, f1557, f1513;
sub.f32 %35, f1556, f1339;
sub.f32 %34, f1132, f1337;
sub.f32 %37, f1555, f1344;
sub.f32 %36, f1136, f1342;
sub.f32 %39, f1554, f1349;
sub.f32 %38, f1140, f1347;
sub.f32 %41, f1553, f1354;
sub.f32 %40, f1144, f1352;
sub.f32 %43, f1552, f1359;
sub.f32 %42, f1148, f1357;
sub.f32 %45, f1551, f1364;
sub.f32 %44, f1152, f1362;
sub.f32 %47, f1550, f1369;
sub.f32 %46, f1156, f1367;
add.f32 %49, f1131, f1305;
sub.f32 %48, f1130, f1306;
sub.f32 %51, f1135, f1374;
sub.f32 %50, f1134, f1372;
sub.f32 %53, f1139, f1379;
sub.f32 %52, f1138, f1377;
sub.f32 %55, f1143, f1384;
sub.f32 %54, f1142, f1382;
sub.f32 %57, f1147, f1388;
sub.f32 %56, f1146, f1387;
sub.f32 %59, f1151, f1393;
sub.f32 %58, f1150, f1391;
sub.f32 %61, f1155, f1398;
sub.f32 %60, f1154, f1396;
sub.f32 %63, f1159, f1403;
sub.f32 %62, f1158, f1401;
})"
     : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y), "=f"(rmem[8].x), "=f"(rmem[8].y), "=f"(rmem[9].x), "=f"(rmem[9].y), "=f"(rmem[10].x), "=f"(rmem[10].y), "=f"(rmem[11].x), "=f"(rmem[11].y), "=f"(rmem[12].x), "=f"(rmem[12].y), "=f"(rmem[13].x), "=f"(rmem[13].y), "=f"(rmem[14].x), "=f"(rmem[14].y), "=f"(rmem[15].x), "=f"(rmem[15].y), "=f"(rmem[16].x), "=f"(rmem[16].y), "=f"(rmem[17].x), "=f"(rmem[17].y), "=f"(rmem[18].x), "=f"(rmem[18].y), "=f"(rmem[19].x), "=f"(rmem[19].y), "=f"(rmem[20].x), "=f"(rmem[20].y), "=f"(rmem[21].x), "=f"(rmem[21].y), "=f"(rmem[22].x), "=f"(rmem[22].y), "=f"(rmem[23].x), "=f"(rmem[23].y), "=f"(rmem[24].x), "=f"(rmem[24].y), "=f"(rmem[25].x), "=f"(rmem[25].y), "=f"(rmem[26].x), "=f"(rmem[26].y), "=f"(rmem[27].x), "=f"(rmem[27].y), "=f"(rmem[28].x), "=f"(rmem[28].y), "=f"(rmem[29].x), "=f"(rmem[29].y), "=f"(rmem[30].x), "=f"(rmem[30].y), "=f"(rmem[31].x), "=f"(rmem[31].y): "r"(smem), "l"(lut_sp_32_1024), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y), "f"(rmem[8].x), "f"(rmem[8].y), "f"(rmem[9].x), "f"(rmem[9].y), "f"(rmem[10].x), "f"(rmem[10].y), "f"(rmem[11].x), "f"(rmem[11].y), "f"(rmem[12].x), "f"(rmem[12].y), "f"(rmem[13].x), "f"(rmem[13].y), "f"(rmem[14].x), "f"(rmem[14].y), "f"(rmem[15].x), "f"(rmem[15].y), "f"(rmem[16].x), "f"(rmem[16].y), "f"(rmem[17].x), "f"(rmem[17].y), "f"(rmem[18].x), "f"(rmem[18].y), "f"(rmem[19].x), "f"(rmem[19].y), "f"(rmem[20].x), "f"(rmem[20].y), "f"(rmem[21].x), "f"(rmem[21].y), "f"(rmem[22].x), "f"(rmem[22].y), "f"(rmem[23].x), "f"(rmem[23].y), "f"(rmem[24].x), 
"f"(rmem[24].y), "f"(rmem[25].x), "f"(rmem[25].y), "f"(rmem[26].x), "f"(rmem[26].y), "f"(rmem[27].x), "f"(rmem[27].y), "f"(rmem[28].x), "f"(rmem[28].y), "f"(rmem[29].x), "f"(rmem[29].y), "f"(rmem[30].x), "f"(rmem[30].y), "f"(rmem[31].x), "f"(rmem[31].y), "f"(rmem[16].y), "f"(rmem[8].y), "f"(rmem[4].y), "f"(rmem[20].y), "f"(rmem[28].y), "f"(rmem[2].y), "f"(rmem[26].y), "f"(rmem[10].y), "f"(rmem[22].y), "f"(rmem[14].y), "f"(rmem[17].y), "f"(rmem[1].y), "f"(rmem[25].y), "f"(rmem[5].y), "f"(rmem[29].y), "f"(rmem[13].y), "f"(rmem[19].y), "f"(rmem[11].y), "f"(rmem[7].y), "f"(rmem[23].y));
};




template<> __forceinline__ __device__ void cufftdx_private_function<82, float, 1>(cufftdx::detail::complex<float> *rmem, unsigned smem){

asm volatile (R"({
.reg .f32 f<523>;
.reg .b32 r<27>;
.reg .b64 rd<12>;
mov.u32 r1, %tid.y;
shl.b32 r2, r1, 13;
mov.u32 r3, %16;
add.s32 r4, r3, r2;
mov.u32 r5, %tid.x;
add.f32 f33, %20, %30;
add.f32 f34, %21, %32;
sub.f32 f35, %20, %30;
sub.f32 f36, %21, %32;
add.f32 f37, %25, %36;
add.f32 f38, %27, %37;
sub.f32 f39, %25, %36;
sub.f32 f40, %27, %37;
add.f32 f41, f33, f37;
add.f32 f42, f34, f38;
sub.f32 f43, f33, f37;
sub.f32 f44, f34, f38;
add.f32 f45, f35, f40;
sub.f32 f46, f36, f39;
sub.f32 f47, f35, f40;
add.f32 f48, f36, f39;
add.f32 f49, %22, %33;
add.f32 f50, %24, %35;
sub.f32 f51, %22, %33;
sub.f32 f52, %24, %35;
add.f32 f53, %28, %38;
add.f32 f54, %29, %39;
sub.f32 f55, %28, %38;
sub.f32 f56, %29, %39;
add.f32 f57, f49, f53;
add.f32 f58, f50, f54;
sub.f32 f59, f49, f53;
sub.f32 f60, f50, f54;
add.f32 f61, f51, f56;
sub.f32 f62, f52, f55;
sub.f32 f63, f51, f56;
add.f32 f64, f52, f55;
mul.f32 f65, f61, 0f3F3504F3;
mul.f32 f66, f62, 0fBF3504F3;
sub.f32 f67, f65, f66;
mul.f32 f68, f62, 0f3F3504F3;
fma.rn.f32 f69, f61, 0fBF3504F3, f68;
mul.f32 f70, f63, 0fBF3504F3;
mul.f32 f71, f64, 0fBF3504F3;
sub.f32 f72, f70, f71;
add.f32 f73, f70, f71;
sub.f32 f74, f41, f57;
sub.f32 f75, f42, f58;
add.f32 f76, f45, f67;
add.f32 f77, f46, f69;
sub.f32 f78, f45, f67;
sub.f32 f79, f46, f69;
add.f32 f80, f43, f60;
sub.f32 f81, f44, f59;
sub.f32 f82, f43, f60;
add.f32 f83, f44, f59;
add.f32 f84, f47, f72;
add.f32 f85, f48, f73;
sub.f32 f86, f47, f72;
sub.f32 f87, f48, f73;
and.b32 r6, r5, 127;
shl.b32 r7, r5, 6;
and.b32 r8, r7, -8192;
add.s32 r9, r4, r8;
shl.b32 r10, r5, 3;
cvt.u64.u32 rd2, r10;
and.b64 rd3, rd2, 1016;
mov.u64 rd4, %17;
add.s64 rd5, rd4, rd3;
ld.global.v2.f32 {f88, f89}, [rd5];
mul.f32 f92, f88, f76;
mul.f32 f93, f89, f77;
mul.f32 f94, f88, f77;
mul.f32 f95, f88, f88;
mul.f32 f96, f89, f89;
sub.f32 f97, f95, f96;
mul.f32 f98, f89, f88;
fma.rn.f32 f99, f89, f88, f98;
mul.f32 f100, f97, f80;
mul.f32 f101, f99, f81;
mul.f32 f102, f97, f81;
mul.f32 f103, f88, f97;
mul.f32 f104, f89, f99;
sub.f32 f105, f103, f104;
mul.f32 f106, f88, f99;
fma.rn.f32 f107, f89, f97, f106;
mul.f32 f108, f105, f84;
mul.f32 f109, f107, f85;
mul.f32 f110, f105, f85;
mul.f32 f111, f88, f105;
mul.f32 f112, f89, f107;
sub.f32 f113, f111, f112;
mul.f32 f114, f88, f107;
fma.rn.f32 f115, f89, f105, f114;
mul.f32 f116, f113, f74;
mul.f32 f117, f115, f75;
mul.f32 f118, f113, f75;
mul.f32 f119, f88, f113;
mul.f32 f120, f89, f115;
sub.f32 f121, f119, f120;
mul.f32 f122, f88, f115;
fma.rn.f32 f123, f89, f113, f122;
mul.f32 f124, f121, f78;
mul.f32 f125, f123, f79;
mul.f32 f126, f121, f79;
mul.f32 f127, f88, f121;
mul.f32 f128, f89, f123;
sub.f32 f129, f127, f128;
mul.f32 f130, f88, f123;
fma.rn.f32 f131, f89, f121, f130;
mul.f32 f132, f129, f82;
mul.f32 f133, f131, f83;
mul.f32 f134, f129, f83;
mul.f32 f135, f88, f129;
mul.f32 f136, f89, f131;
sub.f32 f137, f135, f136;
mul.f32 f138, f88, f131;
fma.rn.f32 f139, f89, f129, f138;
mul.f32 f140, f137, f86;
mul.f32 f141, f139, f87;
mul.f32 f142, f137, f87;
barrier.sync 0;
and.b32 r11, r7, 8128;
add.s32 r12, r9, r11;
add.f32 f143, f42, f58;
add.f32 f144, f41, f57;
fma.rn.f32 f145, f89, f76, f94;
sub.f32 f146, f92, f93;
st.shared.v4.f32 [r12], {f144, f143, f146, f145};
fma.rn.f32 f147, f99, f80, f102;
sub.f32 f148, f100, f101;
sub.f32 f149, f108, f109;
fma.rn.f32 f150, f107, f84, f110;
st.shared.v4.f32 [r12+16], {f148, f147, f149, f150};
fma.rn.f32 f151, f115, f74, f118;
sub.f32 f152, f116, f117;
fma.rn.f32 f153, f123, f78, f126;
sub.f32 f154, f124, f125;
st.shared.v4.f32 [r12+32], {f152, f151, f154, f153};
fma.rn.f32 f155, f131, f82, f134;
sub.f32 f156, f132, f133;
fma.rn.f32 f157, f139, f86, f142;
sub.f32 f158, f140, f141;
st.shared.v4.f32 [r12+48], {f156, f155, f158, f157};
barrier.sync 0;
mad.lo.s32 r13, r6, -56, r12;
ld.shared.v2.f32 {f159, f160}, [r13];
ld.shared.v2.f32 {f163, f164}, [r13+1024];
ld.shared.v2.f32 {f167, f168}, [r13+2048];
ld.shared.v2.f32 {f171, f172}, [r13+3072];
ld.shared.v2.f32 {f175, f176}, [r13+4096];
ld.shared.v2.f32 {f179, f180}, [r13+5120];
ld.shared.v2.f32 {f183, f184}, [r13+6144];
ld.shared.v2.f32 {f187, f188}, [r13+7168];
add.f32 f191, f159, f175;
add.f32 f192, f160, f176;
sub.f32 f193, f159, f175;
sub.f32 f194, f160, f176;
add.f32 f195, f167, f183;
add.f32 f196, f168, f184;
sub.f32 f197, f167, f183;
sub.f32 f198, f168, f184;
add.f32 f199, f191, f195;
add.f32 f200, f192, f196;
sub.f32 f201, f191, f195;
sub.f32 f202, f192, f196;
add.f32 f203, f193, f198;
sub.f32 f204, f194, f197;
sub.f32 f205, f193, f198;
add.f32 f206, f194, f197;
add.f32 f207, f163, f179;
add.f32 f208, f164, f180;
sub.f32 f209, f163, f179;
sub.f32 f210, f164, f180;
add.f32 f211, f171, f187;
add.f32 f212, f172, f188;
sub.f32 f213, f171, f187;
sub.f32 f214, f172, f188;
add.f32 f215, f207, f211;
add.f32 f216, f208, f212;
sub.f32 f217, f207, f211;
sub.f32 f218, f208, f212;
add.f32 f219, f209, f214;
sub.f32 f220, f210, f213;
sub.f32 f221, f209, f214;
add.f32 f222, f210, f213;
mul.f32 f223, f219, 0f3F3504F3;
mul.f32 f224, f220, 0fBF3504F3;
sub.f32 f225, f223, f224;
mul.f32 f226, f220, 0f3F3504F3;
fma.rn.f32 f227, f219, 0fBF3504F3, f226;
mul.f32 f228, f221, 0fBF3504F3;
mul.f32 f229, f222, 0fBF3504F3;
sub.f32 f230, f228, f229;
add.f32 f231, f228, f229;
sub.f32 f232, f199, f215;
sub.f32 f233, f200, f216;
add.f32 f234, f203, f225;
add.f32 f235, f204, f227;
sub.f32 f236, f203, f225;
sub.f32 f237, f204, f227;
add.f32 f238, f201, f218;
sub.f32 f239, f202, f217;
sub.f32 f240, f201, f218;
add.f32 f241, f202, f217;
add.f32 f242, f205, f230;
add.f32 f243, f206, f231;
sub.f32 f244, f205, f230;
sub.f32 f245, f206, f231;
and.b32 r14, r5, 120;
cvt.u64.u32 rd6, r14;
mov.u64 rd7, %18;
add.s64 rd8, rd7, rd6;
ld.global.v2.f32 {f246, f247}, [rd8];
mul.f32 f250, f246, f234;
mul.f32 f251, f247, f235;
mul.f32 f252, f246, f235;
mul.f32 f253, f246, f246;
mul.f32 f254, f247, f247;
sub.f32 f255, f253, f254;
mul.f32 f256, f247, f246;
fma.rn.f32 f257, f247, f246, f256;
mul.f32 f258, f255, f238;
mul.f32 f259, f257, f239;
mul.f32 f260, f255, f239;
mul.f32 f261, f246, f255;
mul.f32 f262, f247, f257;
sub.f32 f263, f261, f262;
mul.f32 f264, f246, f257;
fma.rn.f32 f265, f247, f255, f264;
mul.f32 f266, f263, f242;
mul.f32 f267, f265, f243;
mul.f32 f268, f263, f243;
mul.f32 f269, f246, f263;
mul.f32 f270, f247, f265;
sub.f32 f271, f269, f270;
mul.f32 f272, f246, f265;
fma.rn.f32 f273, f247, f263, f272;
mul.f32 f274, f271, f232;
mul.f32 f275, f273, f233;
mul.f32 f276, f271, f233;
mul.f32 f277, f246, f271;
mul.f32 f278, f247, f273;
sub.f32 f279, f277, f278;
mul.f32 f280, f246, f273;
fma.rn.f32 f281, f247, f271, f280;
mul.f32 f282, f279, f236;
mul.f32 f283, f281, f237;
mul.f32 f284, f279, f237;
mul.f32 f285, f246, f279;
mul.f32 f286, f247, f281;
sub.f32 f287, f285, f286;
mul.f32 f288, f246, f281;
fma.rn.f32 f289, f247, f279, f288;
mul.f32 f290, f287, f240;
mul.f32 f291, f289, f241;
mul.f32 f292, f287, f241;
mul.f32 f293, f246, f287;
mul.f32 f294, f247, f289;
sub.f32 f295, f293, f294;
mul.f32 f296, f246, f289;
fma.rn.f32 f297, f247, f287, f296;
mul.f32 f298, f295, f244;
mul.f32 f299, f297, f245;
mul.f32 f300, f295, f245;
and.b32 r15, r10, 56;
add.s32 r16, r9, r15;
barrier.sync 0;
and.b32 r17, r7, 7680;
add.s32 r18, r16, r17;
add.f32 f301, f200, f216;
add.f32 f302, f199, f215;
st.shared.v2.f32 [r18], {f302, f301};
fma.rn.f32 f303, f247, f234, f252;
sub.f32 f304, f250, f251;
st.shared.v2.f32 [r18+64], {f304, f303};
fma.rn.f32 f305, f257, f238, f260;
sub.f32 f306, f258, f259;
st.shared.v2.f32 [r18+128], {f306, f305};
fma.rn.f32 f307, f265, f242, f268;
sub.f32 f308, f266, f267;
st.shared.v2.f32 [r18+192], {f308, f307};
sub.f32 f309, f274, f275;
fma.rn.f32 f310, f273, f232, f276;
st.shared.v2.f32 [r18+256], {f309, f310};
fma.rn.f32 f311, f281, f236, f284;
sub.f32 f312, f282, f283;
st.shared.v2.f32 [r18+320], {f312, f311};
fma.rn.f32 f313, f289, f240, f292;
sub.f32 f314, f290, f291;
st.shared.v2.f32 [r18+384], {f314, f313};
fma.rn.f32 f315, f297, f244, f300;
sub.f32 f316, f298, f299;
st.shared.v2.f32 [r18+448], {f316, f315};
barrier.sync 0;
mad.lo.s32 r19, r14, -56, r18;
ld.shared.v2.f32 {f317, f318}, [r19];
ld.shared.v2.f32 {f321, f322}, [r19+1024];
ld.shared.v2.f32 {f325, f326}, [r19+2048];
ld.shared.v2.f32 {f329, f330}, [r19+3072];
ld.shared.v2.f32 {f333, f334}, [r19+4096];
ld.shared.v2.f32 {f337, f338}, [r19+5120];
ld.shared.v2.f32 {f341, f342}, [r19+6144];
ld.shared.v2.f32 {f345, f346}, [r19+7168];
add.f32 f349, f317, f333;
add.f32 f350, f318, f334;
sub.f32 f351, f317, f333;
sub.f32 f352, f318, f334;
add.f32 f353, f325, f341;
add.f32 f354, f326, f342;
sub.f32 f355, f325, f341;
sub.f32 f356, f326, f342;
add.f32 f357, f349, f353;
add.f32 f358, f350, f354;
sub.f32 f359, f349, f353;
sub.f32 f360, f350, f354;
add.f32 f361, f351, f356;
sub.f32 f362, f352, f355;
sub.f32 f363, f351, f356;
add.f32 f364, f352, f355;
add.f32 f365, f321, f337;
add.f32 f366, f322, f338;
sub.f32 f367, f321, f337;
sub.f32 f368, f322, f338;
add.f32 f369, f329, f345;
add.f32 f370, f330, f346;
sub.f32 f371, f329, f345;
sub.f32 f372, f330, f346;
add.f32 f373, f365, f369;
add.f32 f374, f366, f370;
sub.f32 f375, f365, f369;
sub.f32 f376, f366, f370;
add.f32 f377, f367, f372;
sub.f32 f378, f368, f371;
sub.f32 f379, f367, f372;
add.f32 f380, f368, f371;
mul.f32 f381, f377, 0f3F3504F3;
mul.f32 f382, f378, 0fBF3504F3;
sub.f32 f383, f381, f382;
mul.f32 f384, f378, 0f3F3504F3;
fma.rn.f32 f385, f377, 0fBF3504F3, f384;
mul.f32 f386, f379, 0fBF3504F3;
mul.f32 f387, f380, 0fBF3504F3;
sub.f32 f388, f386, f387;
add.f32 f389, f386, f387;
sub.f32 f390, f357, f373;
sub.f32 f391, f358, f374;
add.f32 f392, f361, f383;
add.f32 f393, f362, f385;
sub.f32 f394, f361, f383;
sub.f32 f395, f362, f385;
add.f32 f396, f359, f376;
sub.f32 f397, f360, f375;
sub.f32 f398, f359, f376;
add.f32 f399, f360, f375;
add.f32 f400, f363, f388;
add.f32 f401, f364, f389;
sub.f32 f402, f363, f388;
sub.f32 f403, f364, f389;
and.b32 r20, r5, 64;
bfe.u32 r21, r5, 6, 1;
mul.wide.u32 rd9, r21, 8;
mov.u64 rd10, %19;
add.s64 rd11, rd10, rd9;
ld.global.v2.f32 {f404, f405}, [rd11];
mul.f32 f408, f404, f392;
mul.f32 f409, f405, f393;
mul.f32 f410, f404, f393;
mul.f32 f411, f404, f404;
mul.f32 f412, f405, f405;
sub.f32 f413, f411, f412;
mul.f32 f414, f405, f404;
fma.rn.f32 f415, f405, f404, f414;
mul.f32 f416, f413, f396;
mul.f32 f417, f415, f397;
mul.f32 f418, f413, f397;
mul.f32 f419, f404, f413;
mul.f32 f420, f405, f415;
sub.f32 f421, f419, f420;
mul.f32 f422, f404, f415;
fma.rn.f32 f423, f405, f413, f422;
mul.f32 f424, f421, f400;
mul.f32 f425, f423, f401;
mul.f32 f426, f421, f401;
mul.f32 f427, f404, f421;
mul.f32 f428, f405, f423;
sub.f32 f429, f427, f428;
mul.f32 f430, f404, f423;
fma.rn.f32 f431, f405, f421, f430;
mul.f32 f432, f429, f390;
mul.f32 f433, f431, f391;
mul.f32 f434, f429, f391;
mul.f32 f435, f404, f429;
mul.f32 f436, f405, f431;
sub.f32 f437, f435, f436;
mul.f32 f438, f404, f431;
fma.rn.f32 f439, f405, f429, f438;
mul.f32 f440, f437, f394;
mul.f32 f441, f439, f395;
mul.f32 f442, f437, f395;
mul.f32 f443, f404, f437;
mul.f32 f444, f405, f439;
sub.f32 f445, f443, f444;
mul.f32 f446, f404, f439;
fma.rn.f32 f447, f405, f437, f446;
mul.f32 f448, f445, f398;
mul.f32 f449, f447, f399;
mul.f32 f450, f445, f399;
mul.f32 f451, f404, f445;
mul.f32 f452, f405, f447;
sub.f32 f453, f451, f452;
mul.f32 f454, f404, f447;
fma.rn.f32 f455, f405, f445, f454;
mul.f32 f456, f453, f402;
mul.f32 f457, f455, f403;
mul.f32 f458, f453, f403;
and.b32 r22, r10, 504;
add.s32 r23, r9, r22;
barrier.sync 0;
and.b32 r24, r7, 4096;
add.s32 r25, r23, r24;
add.f32 f459, f358, f374;
add.f32 f460, f357, f373;
st.shared.v2.f32 [r25], {f460, f459};
fma.rn.f32 f461, f405, f392, f410;
sub.f32 f462, f408, f409;
st.shared.v2.f32 [r25+512], {f462, f461};
fma.rn.f32 f463, f415, f396, f418;
sub.f32 f464, f416, f417;
st.shared.v2.f32 [r25+1024], {f464, f463};
fma.rn.f32 f465, f423, f400, f426;
sub.f32 f466, f424, f425;
st.shared.v2.f32 [r25+1536], {f466, f465};
sub.f32 f467, f432, f433;
fma.rn.f32 f468, f431, f390, f434;
st.shared.v2.f32 [r25+2048], {f467, f468};
fma.rn.f32 f469, f439, f394, f442;
sub.f32 f470, f440, f441;
st.shared.v2.f32 [r25+2560], {f470, f469};
fma.rn.f32 f471, f447, f398, f450;
sub.f32 f472, f448, f449;
st.shared.v2.f32 [r25+3072], {f472, f471};
fma.rn.f32 f473, f455, f402, f458;
sub.f32 f474, f456, f457;
st.shared.v2.f32 [r25+3584], {f474, f473};
barrier.sync 0;
mad.lo.s32 r26, r20, -56, r25;
ld.shared.v2.f32 {f475, f476}, [r26];
ld.shared.v2.f32 {f479, f480}, [r26+1024];
ld.shared.v2.f32 {f483, f484}, [r26+2048];
ld.shared.v2.f32 {f487, f488}, [r26+3072];
ld.shared.v2.f32 {f491, f492}, [r26+4096];
ld.shared.v2.f32 {f495, f496}, [r26+5120];
ld.shared.v2.f32 {f499, f500}, [r26+6144];
ld.shared.v2.f32 {f503, f504}, [r26+7168];
add.f32 %1, f476, f492;
add.f32 %0, f475, f491;
add.f32 %3, f480, f496;
add.f32 %2, f479, f495;
add.f32 %5, f484, f500;
add.f32 %4, f483, f499;
add.f32 %7, f488, f504;
add.f32 %6, f487, f503;
sub.f32 %9, f476, f492;
sub.f32 %8, f475, f491;
sub.f32 %11, f480, f496;
sub.f32 %10, f479, f495;
sub.f32 %13, f484, f500;
sub.f32 %12, f483, f499;
sub.f32 %15, f488, f504;
sub.f32 %14, f487, f503;
})"
     : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y): "r"(smem), "l"(lut_sp_8_1024), "l"(lut_sp_8_128), "l"(lut_sp_8_16), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y));
};




// cufftdx_private_function<86, float, 1>: single-precision FFT step written as
// one inline-PTX block (the include guard of this header names the file as the
// 1024-point FP32 forward set).
//
// rmem : 16 complex<float> values per thread; rmem[0..15] are read as inputs
//        and all 16 are overwritten with the results.
// smem : 32-bit base address used for the st.shared / ld.shared exchanges. The
//        code adds (%tid.y << 13) and ((%tid.x << 7) & -8192) to it before
//        addressing.
//
// Operand map of the asm statement:
//   %0..%31   outputs, rmem[k].x / rmem[k].y for k = 0..15
//   %32       smem
//   %33, %34  lut_sp_16_1024, lut_sp_16_64 (read with ld.global.v2.f32)
//   %35..%66  inputs, rmem[k].x / rmem[k].y for k = 0..15
//   %67..%76  repeated .y inputs of rmem[8], [4], [10], [2], [14], [1], [5],
//             [13], [11], [7]
//
// Flow: 16-point butterfly network on the register inputs -> multiplication by
// successive powers (1st..15th) of the lut_sp_16_1024 entry -> exchange through
// shared memory -> second 16-point butterfly network -> powers of the
// lut_sp_16_64 entry -> second exchange -> final 4-point butterflies written
// straight into the output operands.
// Float constants: 0f3F3504F3 ~ 0.70710677, 0f3F6C835E ~ 0.92387953,
// 0f3EC3EF15 ~ 0.38268343 (the 0fBF... / 0fBE... forms are their negations).
//
// The block executes four `barrier.sync 0` instructions.
// NOTE(review): presumably every thread of the block has to reach this call
// together -- confirm against the callers.
template<> __forceinline__ __device__ void cufftdx_private_function<86, float, 1>(cufftdx::detail::complex<float> *rmem, unsigned smem){

// Everything below is a single PTX scope; temporaries are PTX virtual registers.
asm volatile (R"({
.reg .f32 f<1097>;
.reg .b32 r<35>;
.reg .b64 rd<12>;
mov.u32 r1, %tid.y;
shl.b32 r2, r1, 13;
mov.u32 r3, %32;
add.s32 r4, r3, r2;
add.f32 f65, %35, %51;
sub.f32 f67, %35, %51;
add.f32 f1088, %36, %67;
sub.f32 f68, %36, %67;
add.f32 f69, %43, %59;
sub.f32 f71, %43, %59;
add.f32 f1086, %68, %60;
sub.f32 f72, %68, %60;
add.f32 f73, f65, f69;
sub.f32 f75, f65, f69;
add.f32 f1085, f1088, f1086;
sub.f32 f76, f1088, f1086;
add.f32 f77, f67, f72;
sub.f32 f79, f67, f72;
sub.f32 f1084, f68, f71;
add.f32 f80, f68, f71;
add.f32 f81, %39, %55;
sub.f32 f83, %39, %55;
add.f32 f1081, %70, %69;
sub.f32 f84, %70, %69;
add.f32 f85, %47, %63;
sub.f32 f87, %47, %63;
add.f32 f1079, %48, %71;
sub.f32 f88, %48, %71;
add.f32 f89, f81, f85;
sub.f32 f91, f81, f85;
add.f32 f1078, f1081, f1079;
sub.f32 f92, f1081, f1079;
add.f32 f93, f83, f88;
sub.f32 f95, f83, f88;
sub.f32 f1077, f84, f87;
add.f32 f96, f84, f87;
mul.f32 f98, f1077, 0fBF3504F3;
mul.f32 f1076, f93, 0f3F3504F3;
sub.f32 f99, f1076, f98;
mul.f32 f100, f1077, 0f3F3504F3;
fma.rn.f32 f101, f93, 0fBF3504F3, f100;
mul.f32 f102, f95, 0fBF3504F3;
mul.f32 f103, f96, 0fBF3504F3;
sub.f32 f104, f102, f103;
add.f32 f105, f102, f103;
add.f32 f106, f73, f89;
sub.f32 f108, f73, f89;
add.f32 f1075, f1085, f1078;
sub.f32 f109, f1085, f1078;
add.f32 f110, f77, f99;
sub.f32 f112, f77, f99;
add.f32 f1074, f1084, f101;
sub.f32 f113, f1084, f101;
add.f32 f114, f75, f92;
sub.f32 f116, f75, f92;
sub.f32 f1073, f76, f91;
add.f32 f117, f76, f91;
add.f32 f118, f79, f104;
sub.f32 f120, f79, f104;
add.f32 f1072, f80, f105;
sub.f32 f121, f80, f105;
add.f32 f122, %37, %53;
sub.f32 f124, %37, %53;
add.f32 f1070, %72, %54;
sub.f32 f125, %72, %54;
add.f32 f126, %45, %61;
sub.f32 f128, %45, %61;
add.f32 f1067, %73, %74;
sub.f32 f129, %73, %74;
add.f32 f130, f122, f126;
sub.f32 f132, f122, f126;
add.f32 f1066, f1070, f1067;
sub.f32 f133, f1070, f1067;
add.f32 f134, f124, f129;
sub.f32 f136, f124, f129;
sub.f32 f1065, f125, f128;
add.f32 f137, f125, f128;
add.f32 f138, %41, %57;
sub.f32 f140, %41, %57;
add.f32 f1063, %42, %75;
sub.f32 f141, %42, %75;
add.f32 f142, %49, %65;
sub.f32 f144, %49, %65;
add.f32 f1061, %76, %66;
sub.f32 f145, %76, %66;
add.f32 f146, f138, f142;
sub.f32 f148, f138, f142;
add.f32 f1060, f1063, f1061;
sub.f32 f149, f1063, f1061;
add.f32 f150, f140, f145;
sub.f32 f152, f140, f145;
sub.f32 f1059, f141, f144;
add.f32 f153, f141, f144;
mul.f32 f155, f1059, 0fBF3504F3;
mul.f32 f1058, f150, 0f3F3504F3;
sub.f32 f156, f1058, f155;
mul.f32 f157, f1059, 0f3F3504F3;
fma.rn.f32 f158, f150, 0fBF3504F3, f157;
mul.f32 f159, f152, 0fBF3504F3;
mul.f32 f160, f153, 0fBF3504F3;
sub.f32 f161, f159, f160;
add.f32 f162, f159, f160;
add.f32 f163, f130, f146;
sub.f32 f165, f130, f146;
add.f32 f1057, f1066, f1060;
sub.f32 f166, f1066, f1060;
add.f32 f167, f134, f156;
sub.f32 f169, f134, f156;
add.f32 f1056, f1065, f158;
sub.f32 f170, f1065, f158;
add.f32 f171, f132, f149;
sub.f32 f173, f132, f149;
sub.f32 f1055, f133, f148;
add.f32 f174, f133, f148;
add.f32 f175, f136, f161;
sub.f32 f177, f136, f161;
add.f32 f1054, f137, f162;
sub.f32 f178, f137, f162;
mul.f32 f1052, f167, 0f3F6C835E;
mul.f32 f1053, f1056, 0fBEC3EF15;
sub.f32 f181, f1052, f1053;
mul.f32 f182, f1056, 0f3F6C835E;
fma.rn.f32 f183, f167, 0fBEC3EF15, f182;
mul.f32 f1050, f171, 0f3F3504F3;
mul.f32 f1051, f1055, 0fBF3504F3;
sub.f32 f186, f1050, f1051;
mul.f32 f187, f1055, 0f3F3504F3;
fma.rn.f32 f188, f171, 0fBF3504F3, f187;
mul.f32 f1048, f175, 0f3EC3EF15;
mul.f32 f1049, f1054, 0fBF6C835E;
sub.f32 f191, f1048, f1049;
mul.f32 f192, f1054, 0f3EC3EF15;
fma.rn.f32 f193, f175, 0fBF6C835E, f192;
mul.f32 f1046, f169, 0fBEC3EF15;
mul.f32 f1047, f170, 0fBF6C835E;
sub.f32 f196, f1046, f1047;
mul.f32 f197, f170, 0fBEC3EF15;
fma.rn.f32 f198, f169, 0fBF6C835E, f197;
mul.f32 f199, f173, 0fBF3504F3;
mul.f32 f200, f174, 0fBF3504F3;
sub.f32 f201, f199, f200;
add.f32 f202, f199, f200;
mul.f32 f1044, f177, 0fBF6C835E;
mul.f32 f1045, f178, 0fBEC3EF15;
sub.f32 f205, f1044, f1045;
mul.f32 f206, f178, 0fBF6C835E;
fma.rn.f32 f207, f177, 0fBEC3EF15, f206;
add.f32 f210, f110, f181;
sub.f32 f212, f110, f181;
add.f32 f1043, f1074, f183;
sub.f32 f213, f1074, f183;
add.f32 f214, f114, f186;
sub.f32 f216, f114, f186;
add.f32 f1042, f1073, f188;
sub.f32 f217, f1073, f188;
add.f32 f218, f118, f191;
sub.f32 f220, f118, f191;
add.f32 f1041, f1072, f193;
sub.f32 f221, f1072, f193;
add.f32 f222, f108, f166;
sub.f32 f224, f108, f166;
sub.f32 f1040, f109, f165;
add.f32 f225, f109, f165;
add.f32 f226, f112, f196;
sub.f32 f228, f112, f196;
add.f32 f1039, f113, f198;
sub.f32 f229, f113, f198;
add.f32 f230, f116, f201;
sub.f32 f232, f116, f201;
add.f32 f1038, f117, f202;
sub.f32 f233, f117, f202;
add.f32 f234, f120, f205;
sub.f32 f236, f120, f205;
add.f32 f1037, f121, f207;
sub.f32 f237, f121, f207;
mov.u32 r22, %tid.x;
shl.b32 r7, r22, 7;
and.b32 r8, r7, -8192;
add.s32 r9, r4, r8;
shl.b32 r10, r22, 3;
cvt.u64.u32 rd2, r10;
and.b64 rd3, rd2, 504;
mov.u64 rd4, %33;
add.s64 rd5, rd4, rd3;
ld.global.v2.f32 {f238, f239}, [rd5];
mul.f32 f243, f239, f1043;
mul.f32 f244, f238, f1043;
mul.f32 f246, f239, f239;
mul.f32 f1036, f238, f238;
sub.f32 f247, f1036, f246;
mul.f32 f248, f239, f238;
fma.rn.f32 f249, f239, f238, f248;
mul.f32 f251, f249, f1042;
mul.f32 f252, f247, f1042;
mul.f32 f1034, f238, f247;
mul.f32 f1035, f239, f249;
sub.f32 f255, f1034, f1035;
mul.f32 f1033, f247, f214;
mul.f32 f256, f238, f249;
fma.rn.f32 f257, f239, f247, f256;
mul.f32 f259, f257, f1041;
mul.f32 f260, f255, f1041;
mul.f32 f262, f239, f257;
mul.f32 f1032, f238, f255;
sub.f32 f263, f1032, f262;
mul.f32 f1031, f255, f218;
mul.f32 f264, f238, f257;
fma.rn.f32 f265, f239, f255, f264;
mul.f32 f267, f265, f1040;
mul.f32 f268, f263, f1040;
mul.f32 f270, f239, f265;
mul.f32 f1030, f238, f263;
sub.f32 f271, f1030, f270;
mul.f32 f1029, f263, f222;
mul.f32 f272, f238, f265;
fma.rn.f32 f273, f239, f263, f272;
mul.f32 f275, f273, f1039;
mul.f32 f276, f271, f1039;
mul.f32 f1027, f238, f271;
mul.f32 f1028, f239, f273;
sub.f32 f279, f1027, f1028;
mul.f32 f1026, f271, f226;
mul.f32 f280, f238, f273;
fma.rn.f32 f281, f239, f271, f280;
mul.f32 f283, f281, f1038;
mul.f32 f284, f279, f1038;
mul.f32 f286, f239, f281;
mul.f32 f1025, f238, f279;
sub.f32 f287, f1025, f286;
mul.f32 f1024, f279, f230;
mul.f32 f288, f238, f281;
fma.rn.f32 f289, f239, f279, f288;
mul.f32 f291, f289, f1037;
mul.f32 f292, f287, f1037;
mul.f32 f294, f239, f289;
mul.f32 f1023, f238, f287;
sub.f32 f295, f1023, f294;
mul.f32 f1022, f287, f234;
mul.f32 f296, f238, f289;
fma.rn.f32 f297, f239, f287, f296;
sub.f32 f1021, f1075, f1057;
mul.f32 f299, f297, f1021;
mul.f32 f300, f295, f1021;
mul.f32 f1019, f238, f295;
mul.f32 f1020, f239, f297;
sub.f32 f303, f1019, f1020;
sub.f32 f1018, f106, f163;
mul.f32 f1017, f295, f1018;
mul.f32 f304, f238, f297;
fma.rn.f32 f305, f239, f295, f304;
mul.f32 f307, f305, f213;
mul.f32 f308, f303, f213;
mul.f32 f310, f239, f305;
mul.f32 f1016, f238, f303;
sub.f32 f311, f1016, f310;
mul.f32 f1015, f303, f212;
mul.f32 f312, f238, f305;
fma.rn.f32 f313, f239, f303, f312;
mul.f32 f315, f313, f217;
mul.f32 f316, f311, f217;
mul.f32 f1013, f238, f311;
mul.f32 f1014, f239, f313;
sub.f32 f319, f1013, f1014;
mul.f32 f1012, f311, f216;
mul.f32 f320, f238, f313;
fma.rn.f32 f321, f239, f311, f320;
mul.f32 f323, f321, f221;
mul.f32 f324, f319, f221;
mul.f32 f326, f239, f321;
mul.f32 f1011, f238, f319;
sub.f32 f327, f1011, f326;
mul.f32 f1010, f319, f220;
mul.f32 f328, f238, f321;
fma.rn.f32 f329, f239, f319, f328;
mul.f32 f331, f329, f225;
mul.f32 f332, f327, f225;
mul.f32 f334, f239, f329;
mul.f32 f1009, f238, f327;
sub.f32 f335, f1009, f334;
mul.f32 f1008, f327, f224;
mul.f32 f336, f238, f329;
fma.rn.f32 f337, f239, f327, f336;
mul.f32 f339, f337, f229;
mul.f32 f340, f335, f229;
mul.f32 f1006, f238, f335;
mul.f32 f1007, f239, f337;
sub.f32 f343, f1006, f1007;
mul.f32 f1005, f335, f228;
mul.f32 f344, f238, f337;
fma.rn.f32 f345, f239, f335, f344;
mul.f32 f347, f345, f233;
mul.f32 f348, f343, f233;
mul.f32 f350, f239, f345;
mul.f32 f1004, f238, f343;
sub.f32 f351, f1004, f350;
mul.f32 f1003, f238, f210;
mul.f32 f352, f238, f345;
mul.f32 f1002, f343, f232;
fma.rn.f32 f353, f239, f343, f352;
mul.f32 f354, f351, f236;
mul.f32 f355, f353, f237;
mul.f32 f356, f351, f237;
sub.f32 f1094, f1075, f1057;
mul.f32 f1093, f297, f1094;
barrier.sync 0;
and.b32 r11, r7, 8064;
add.s32 r12, r9, r11;
add.f32 f357, f1075, f1057;
sub.f32 f1091, f106, f163;
add.f32 f358, f106, f163;
mov.u32 r34, %tid.x;
fma.rn.f32 f359, f239, f210, f244;
sub.f32 f360, f1003, f243;
st.shared.v4.f32 [r12], {f358, f357, f360, f359};
fma.rn.f32 f361, f249, f214, f252;
sub.f32 f362, f1033, f251;
fma.rn.f32 f363, f257, f218, f260;
sub.f32 f364, f1031, f259;
st.shared.v4.f32 [r12+16], {f362, f361, f364, f363};
sub.f32 f365, f1029, f267;
fma.rn.f32 f366, f265, f222, f268;
fma.rn.f32 f367, f273, f226, f276;
sub.f32 f368, f1026, f275;
st.shared.v4.f32 [r12+32], {f365, f366, f368, f367};
fma.rn.f32 f369, f281, f230, f284;
sub.f32 f370, f1024, f283;
fma.rn.f32 f371, f289, f234, f292;
sub.f32 f372, f1022, f291;
st.shared.v4.f32 [r12+48], {f370, f369, f372, f371};
fma.rn.f32 f373, f297, f1091, f300;
sub.f32 f374, f1017, f1093;
fma.rn.f32 f375, f305, f212, f308;
sub.f32 f376, f1015, f307;
st.shared.v4.f32 [r12+64], {f374, f373, f376, f375};
fma.rn.f32 f377, f313, f216, f316;
sub.f32 f378, f1012, f315;
fma.rn.f32 f379, f321, f220, f324;
sub.f32 f380, f1010, f323;
st.shared.v4.f32 [r12+80], {f378, f377, f380, f379};
fma.rn.f32 f381, f329, f224, f332;
sub.f32 f382, f1008, f331;
fma.rn.f32 f383, f337, f228, f340;
sub.f32 f384, f1005, f339;
st.shared.v4.f32 [r12+96], {f382, f381, f384, f383};
fma.rn.f32 f385, f345, f232, f348;
sub.f32 f386, f1002, f347;
fma.rn.f32 f387, f353, f236, f356;
sub.f32 f388, f354, f355;
st.shared.v4.f32 [r12+112], {f386, f385, f388, f387};
barrier.sync 0;
and.b32 r21, r34, 63;
mad.lo.s32 r13, r21, -120, r12;
ld.shared.v2.f32 {f389, f390}, [r13];
ld.shared.v2.f32 {f393, f394}, [r13+512];
ld.shared.v2.f32 {f397, f398}, [r13+1024];
ld.shared.v2.f32 {f401, f402}, [r13+1536];
ld.shared.v2.f32 {f405, f406}, [r13+2048];
ld.shared.v2.f32 {f409, f410}, [r13+2560];
ld.shared.v2.f32 {f413, f414}, [r13+3072];
ld.shared.v2.f32 {f417, f418}, [r13+3584];
ld.shared.v2.f32 {f421, f422}, [r13+4096];
ld.shared.v2.f32 {f425, f426}, [r13+4608];
ld.shared.v2.f32 {f429, f430}, [r13+5120];
ld.shared.v2.f32 {f433, f434}, [r13+5632];
ld.shared.v2.f32 {f437, f438}, [r13+6144];
ld.shared.v2.f32 {f441, f442}, [r13+6656];
ld.shared.v2.f32 {f445, f446}, [r13+7168];
ld.shared.v2.f32 {f449, f450}, [r13+7680];
add.f32 f453, f389, f421;
sub.f32 f455, f389, f421;
add.f32 f1001, f390, f422;
sub.f32 f456, f390, f422;
add.f32 f457, f405, f437;
sub.f32 f459, f405, f437;
add.f32 f1000, f406, f438;
sub.f32 f460, f406, f438;
add.f32 f461, f453, f457;
sub.f32 f463, f453, f457;
add.f32 f999, f1001, f1000;
sub.f32 f464, f1001, f1000;
add.f32 f465, f455, f460;
sub.f32 f467, f455, f460;
sub.f32 f998, f456, f459;
add.f32 f468, f456, f459;
add.f32 f469, f397, f429;
sub.f32 f471, f397, f429;
add.f32 f997, f398, f430;
sub.f32 f472, f398, f430;
add.f32 f473, f413, f445;
sub.f32 f475, f413, f445;
add.f32 f996, f414, f446;
sub.f32 f476, f414, f446;
add.f32 f477, f469, f473;
sub.f32 f479, f469, f473;
add.f32 f995, f997, f996;
sub.f32 f480, f997, f996;
add.f32 f481, f471, f476;
sub.f32 f483, f471, f476;
sub.f32 f994, f472, f475;
add.f32 f484, f472, f475;
mul.f32 f992, f481, 0f3F3504F3;
mul.f32 f993, f994, 0fBF3504F3;
sub.f32 f487, f992, f993;
mul.f32 f488, f994, 0f3F3504F3;
fma.rn.f32 f489, f481, 0fBF3504F3, f488;
mul.f32 f490, f483, 0fBF3504F3;
mul.f32 f491, f484, 0fBF3504F3;
sub.f32 f492, f490, f491;
add.f32 f493, f490, f491;
add.f32 f494, f461, f477;
sub.f32 f496, f461, f477;
add.f32 f991, f999, f995;
sub.f32 f497, f999, f995;
add.f32 f498, f465, f487;
sub.f32 f500, f465, f487;
add.f32 f990, f998, f489;
sub.f32 f501, f998, f489;
add.f32 f502, f463, f480;
sub.f32 f504, f463, f480;
sub.f32 f989, f464, f479;
add.f32 f505, f464, f479;
add.f32 f506, f467, f492;
sub.f32 f508, f467, f492;
add.f32 f988, f468, f493;
sub.f32 f509, f468, f493;
add.f32 f510, f393, f425;
sub.f32 f512, f393, f425;
add.f32 f987, f394, f426;
sub.f32 f513, f394, f426;
add.f32 f514, f409, f441;
sub.f32 f516, f409, f441;
add.f32 f986, f410, f442;
sub.f32 f517, f410, f442;
add.f32 f518, f510, f514;
sub.f32 f520, f510, f514;
add.f32 f985, f987, f986;
sub.f32 f521, f987, f986;
add.f32 f522, f512, f517;
sub.f32 f524, f512, f517;
sub.f32 f984, f513, f516;
add.f32 f525, f513, f516;
add.f32 f526, f401, f433;
sub.f32 f528, f401, f433;
add.f32 f983, f402, f434;
sub.f32 f529, f402, f434;
add.f32 f530, f417, f449;
sub.f32 f532, f417, f449;
add.f32 f982, f418, f450;
sub.f32 f533, f418, f450;
add.f32 f534, f526, f530;
sub.f32 f536, f526, f530;
add.f32 f981, f983, f982;
sub.f32 f537, f983, f982;
add.f32 f538, f528, f533;
sub.f32 f540, f528, f533;
sub.f32 f980, f529, f532;
add.f32 f541, f529, f532;
mul.f32 f978, f538, 0f3F3504F3;
mul.f32 f979, f980, 0fBF3504F3;
sub.f32 f544, f978, f979;
mul.f32 f545, f980, 0f3F3504F3;
fma.rn.f32 f546, f538, 0fBF3504F3, f545;
mul.f32 f547, f540, 0fBF3504F3;
mul.f32 f548, f541, 0fBF3504F3;
sub.f32 f549, f547, f548;
add.f32 f550, f547, f548;
add.f32 f551, f518, f534;
sub.f32 f553, f518, f534;
add.f32 f977, f985, f981;
sub.f32 f554, f985, f981;
add.f32 f555, f522, f544;
sub.f32 f557, f522, f544;
add.f32 f976, f984, f546;
sub.f32 f558, f984, f546;
add.f32 f559, f520, f537;
sub.f32 f561, f520, f537;
sub.f32 f975, f521, f536;
add.f32 f562, f521, f536;
add.f32 f563, f524, f549;
sub.f32 f565, f524, f549;
add.f32 f974, f525, f550;
sub.f32 f566, f525, f550;
mul.f32 f568, f976, 0fBEC3EF15;
mul.f32 f973, f555, 0f3F6C835E;
sub.f32 f569, f973, f568;
mul.f32 f570, f976, 0f3F6C835E;
fma.rn.f32 f571, f555, 0fBEC3EF15, f570;
mul.f32 f573, f975, 0fBF3504F3;
mul.f32 f972, f559, 0f3F3504F3;
sub.f32 f574, f972, f573;
mul.f32 f575, f975, 0f3F3504F3;
fma.rn.f32 f576, f559, 0fBF3504F3, f575;
mul.f32 f970, f563, 0f3EC3EF15;
mul.f32 f971, f974, 0fBF6C835E;
sub.f32 f579, f970, f971;
mul.f32 f580, f974, 0f3EC3EF15;
fma.rn.f32 f581, f563, 0fBF6C835E, f580;
mul.f32 f968, f557, 0fBEC3EF15;
mul.f32 f969, f558, 0fBF6C835E;
sub.f32 f584, f968, f969;
mul.f32 f585, f558, 0fBEC3EF15;
fma.rn.f32 f586, f557, 0fBF6C835E, f585;
mul.f32 f587, f561, 0fBF3504F3;
mul.f32 f588, f562, 0fBF3504F3;
sub.f32 f589, f587, f588;
add.f32 f590, f587, f588;
mul.f32 f592, f566, 0fBEC3EF15;
mul.f32 f967, f565, 0fBF6C835E;
sub.f32 f593, f967, f592;
mul.f32 f594, f566, 0fBF6C835E;
fma.rn.f32 f595, f565, 0fBEC3EF15, f594;
add.f32 f598, f498, f569;
sub.f32 f600, f498, f569;
add.f32 f966, f990, f571;
sub.f32 f601, f990, f571;
add.f32 f602, f502, f574;
sub.f32 f604, f502, f574;
add.f32 f965, f989, f576;
sub.f32 f605, f989, f576;
add.f32 f606, f506, f579;
sub.f32 f608, f506, f579;
add.f32 f964, f988, f581;
sub.f32 f609, f988, f581;
add.f32 f610, f496, f554;
sub.f32 f612, f496, f554;
sub.f32 f963, f497, f553;
add.f32 f613, f497, f553;
add.f32 f614, f500, f584;
sub.f32 f616, f500, f584;
add.f32 f962, f501, f586;
sub.f32 f617, f501, f586;
add.f32 f618, f504, f589;
sub.f32 f620, f504, f589;
add.f32 f961, f505, f590;
sub.f32 f621, f505, f590;
add.f32 f622, f508, f593;
sub.f32 f624, f508, f593;
add.f32 f960, f509, f595;
sub.f32 f625, f509, f595;
and.b32 r14, r34, 48;
bfe.u32 r15, r34, 4, 2;
mul.wide.u32 rd6, r15, 8;
mov.u64 rd7, %34;
add.s64 rd8, rd7, rd6;
ld.global.v2.f32 {f626, f627}, [rd8];
mul.f32 f631, f627, f966;
mul.f32 f632, f626, f966;
mul.f32 f634, f627, f627;
mul.f32 f959, f626, f626;
sub.f32 f635, f959, f634;
mul.f32 f636, f627, f626;
fma.rn.f32 f637, f627, f626, f636;
mul.f32 f639, f637, f965;
mul.f32 f640, f635, f965;
mul.f32 f957, f626, f635;
mul.f32 f958, f627, f637;
sub.f32 f643, f957, f958;
mul.f32 f956, f635, f602;
mul.f32 f644, f626, f637;
fma.rn.f32 f645, f627, f635, f644;
mul.f32 f647, f645, f964;
mul.f32 f648, f643, f964;
mul.f32 f650, f627, f645;
mul.f32 f955, f626, f643;
sub.f32 f651, f955, f650;
mul.f32 f954, f643, f606;
mul.f32 f652, f626, f645;
fma.rn.f32 f653, f627, f643, f652;
mul.f32 f655, f653, f963;
mul.f32 f656, f651, f963;
mul.f32 f658, f627, f653;
mul.f32 f953, f626, f651;
sub.f32 f659, f953, f658;
mul.f32 f952, f651, f610;
mul.f32 f660, f626, f653;
fma.rn.f32 f661, f627, f651, f660;
mul.f32 f663, f661, f962;
mul.f32 f664, f659, f962;
mul.f32 f950, f626, f659;
mul.f32 f951, f627, f661;
sub.f32 f667, f950, f951;
mul.f32 f949, f659, f614;
mul.f32 f668, f626, f661;
fma.rn.f32 f669, f627, f659, f668;
mul.f32 f671, f669, f961;
mul.f32 f672, f667, f961;
mul.f32 f674, f627, f669;
mul.f32 f948, f626, f667;
sub.f32 f675, f948, f674;
mul.f32 f947, f667, f618;
mul.f32 f676, f626, f669;
fma.rn.f32 f677, f627, f667, f676;
mul.f32 f679, f677, f960;
mul.f32 f680, f675, f960;
mul.f32 f682, f627, f677;
mul.f32 f946, f626, f675;
sub.f32 f683, f946, f682;
mul.f32 f945, f675, f622;
mul.f32 f684, f626, f677;
fma.rn.f32 f685, f627, f675, f684;
sub.f32 f944, f991, f977;
mul.f32 f687, f685, f944;
mul.f32 f688, f683, f944;
mul.f32 f942, f626, f683;
mul.f32 f943, f627, f685;
sub.f32 f691, f942, f943;
sub.f32 f941, f494, f551;
mul.f32 f940, f683, f941;
mul.f32 f692, f626, f685;
fma.rn.f32 f693, f627, f683, f692;
mul.f32 f695, f693, f601;
mul.f32 f696, f691, f601;
mul.f32 f698, f627, f693;
mul.f32 f939, f626, f691;
sub.f32 f699, f939, f698;
mul.f32 f938, f691, f600;
mul.f32 f700, f626, f693;
fma.rn.f32 f701, f627, f691, f700;
mul.f32 f703, f701, f605;
mul.f32 f704, f699, f605;
mul.f32 f936, f626, f699;
mul.f32 f937, f627, f701;
sub.f32 f707, f936, f937;
mul.f32 f935, f699, f604;
mul.f32 f708, f626, f701;
fma.rn.f32 f709, f627, f699, f708;
mul.f32 f711, f709, f609;
mul.f32 f712, f707, f609;
mul.f32 f714, f627, f709;
mul.f32 f934, f626, f707;
sub.f32 f715, f934, f714;
mul.f32 f933, f707, f608;
mul.f32 f716, f626, f709;
fma.rn.f32 f717, f627, f707, f716;
mul.f32 f719, f717, f613;
mul.f32 f720, f715, f613;
mul.f32 f722, f627, f717;
mul.f32 f932, f626, f715;
sub.f32 f723, f932, f722;
mul.f32 f931, f715, f612;
mul.f32 f724, f626, f717;
fma.rn.f32 f725, f627, f715, f724;
mul.f32 f727, f725, f617;
mul.f32 f728, f723, f617;
mul.f32 f929, f626, f723;
mul.f32 f930, f627, f725;
sub.f32 f731, f929, f930;
mul.f32 f928, f723, f616;
mul.f32 f732, f626, f725;
fma.rn.f32 f733, f627, f723, f732;
mul.f32 f735, f733, f621;
mul.f32 f736, f731, f621;
mul.f32 f738, f627, f733;
mul.f32 f927, f626, f731;
sub.f32 f739, f927, f738;
mul.f32 f926, f626, f598;
mul.f32 f740, f626, f733;
mul.f32 f925, f731, f620;
fma.rn.f32 f741, f627, f731, f740;
mul.f32 f742, f739, f624;
mul.f32 f743, f741, f625;
mul.f32 f744, f739, f625;
mov.u32 r24, %tid.x;
shl.b32 r23, r24, 3;
and.b32 r16, r23, 120;
add.s32 r17, r9, r16;
mov.u32 r26, %tid.x;
shl.b32 r25, r26, 7;
barrier.sync 0;
and.b32 r18, r25, 6144;
add.s32 r19, r17, r18;
mov.u32 r28, %tid.x;
and.b32 r27, r28, 48;
add.f32 f745, f991, f977;
sub.f32 f1092, f494, f551;
add.f32 f746, f494, f551;
st.shared.v2.f32 [r19], {f746, f745};
mov.u32 r31, %tid.x;
and.b32 r30, r31, 48;
fma.rn.f32 f747, f627, f598, f632;
sub.f32 f748, f926, f631;
st.shared.v2.f32 [r19+128], {f748, f747};
fma.rn.f32 f749, f637, f602, f640;
sub.f32 f750, f956, f639;
st.shared.v2.f32 [r19+256], {f750, f749};
fma.rn.f32 f751, f645, f606, f648;
sub.f32 f752, f954, f647;
st.shared.v2.f32 [r19+384], {f752, f751};
fma.rn.f32 f753, f653, f610, f656;
sub.f32 f754, f952, f655;
st.shared.v2.f32 [r19+512], {f754, f753};
sub.f32 f755, f949, f663;
fma.rn.f32 f756, f661, f614, f664;
st.shared.v2.f32 [r19+640], {f755, f756};
fma.rn.f32 f757, f669, f618, f672;
sub.f32 f758, f947, f671;
st.shared.v2.f32 [r19+768], {f758, f757};
fma.rn.f32 f759, f677, f622, f680;
sub.f32 f760, f945, f679;
st.shared.v2.f32 [r19+896], {f760, f759};
fma.rn.f32 f761, f685, f1092, f688;
sub.f32 f762, f940, f687;
st.shared.v2.f32 [r19+1024], {f762, f761};
fma.rn.f32 f763, f693, f600, f696;
sub.f32 f764, f938, f695;
st.shared.v2.f32 [r19+1152], {f764, f763};
fma.rn.f32 f765, f701, f604, f704;
sub.f32 f766, f935, f703;
st.shared.v2.f32 [r19+1280], {f766, f765};
fma.rn.f32 f767, f709, f608, f712;
sub.f32 f768, f933, f711;
st.shared.v2.f32 [r19+1408], {f768, f767};
fma.rn.f32 f769, f717, f612, f720;
sub.f32 f770, f931, f719;
st.shared.v2.f32 [r19+1536], {f770, f769};
fma.rn.f32 f771, f725, f616, f728;
sub.f32 f772, f928, f727;
st.shared.v2.f32 [r19+1664], {f772, f771};
fma.rn.f32 f773, f733, f620, f736;
sub.f32 f774, f925, f735;
st.shared.v2.f32 [r19+1792], {f774, f773};
fma.rn.f32 f775, f741, f624, f744;
sub.f32 f776, f742, f743;
st.shared.v2.f32 [r19+1920], {f776, f775};
barrier.sync 0;
mad.lo.s32 r20, r30, -120, r19;
ld.shared.v2.f32 {f777, f778}, [r20];
ld.shared.v2.f32 {f781, f782}, [r20+512];
ld.shared.v2.f32 {f785, f786}, [r20+1024];
ld.shared.v2.f32 {f789, f790}, [r20+1536];
ld.shared.v2.f32 {f793, f794}, [r20+2048];
ld.shared.v2.f32 {f797, f798}, [r20+2560];
ld.shared.v2.f32 {f801, f802}, [r20+3072];
ld.shared.v2.f32 {f805, f806}, [r20+3584];
ld.shared.v2.f32 {f809, f810}, [r20+4096];
ld.shared.v2.f32 {f813, f814}, [r20+4608];
ld.shared.v2.f32 {f817, f818}, [r20+5120];
ld.shared.v2.f32 {f821, f822}, [r20+5632];
ld.shared.v2.f32 {f825, f826}, [r20+6144];
ld.shared.v2.f32 {f829, f830}, [r20+6656];
ld.shared.v2.f32 {f833, f834}, [r20+7168];
ld.shared.v2.f32 {f837, f838}, [r20+7680];
add.f32 f841, f777, f809;
sub.f32 f843, f777, f809;
add.f32 f924, f778, f810;
sub.f32 f844, f778, f810;
add.f32 f845, f793, f825;
sub.f32 f847, f793, f825;
add.f32 f923, f794, f826;
sub.f32 f848, f794, f826;
add.f32 f849, f781, f813;
sub.f32 f851, f781, f813;
add.f32 f922, f782, f814;
sub.f32 f852, f782, f814;
add.f32 f853, f797, f829;
sub.f32 f855, f797, f829;
add.f32 f921, f798, f830;
sub.f32 f856, f798, f830;
add.f32 f857, f785, f817;
sub.f32 f859, f785, f817;
add.f32 f920, f786, f818;
sub.f32 f860, f786, f818;
add.f32 f861, f801, f833;
sub.f32 f863, f801, f833;
add.f32 f919, f802, f834;
sub.f32 f864, f802, f834;
add.f32 f865, f789, f821;
sub.f32 f867, f789, f821;
add.f32 f918, f790, f822;
sub.f32 f868, f790, f822;
add.f32 f869, f805, f837;
sub.f32 f871, f805, f837;
add.f32 f917, f806, f838;
sub.f32 f872, f806, f838;
add.f32 %1, f924, f923;
add.f32 %0, f841, f845;
add.f32 %2, f849, f853;
add.f32 %3, f922, f921;
add.f32 %4, f857, f861;
add.f32 %5, f920, f919;
add.f32 %6, f865, f869;
add.f32 %7, f918, f917;
add.f32 %8, f843, f848;
sub.f32 %9, f844, f847;
sub.f32 %11, f852, f855;
add.f32 %10, f851, f856;
sub.f32 %13, f860, f863;
add.f32 %12, f859, f864;
sub.f32 %15, f868, f871;
add.f32 %14, f867, f872;
sub.f32 %17, f924, f923;
sub.f32 %16, f841, f845;
sub.f32 %19, f922, f921;
sub.f32 %18, f849, f853;
sub.f32 %21, f920, f919;
sub.f32 %20, f857, f861;
sub.f32 %23, f918, f917;
sub.f32 %22, f865, f869;
add.f32 %25, f844, f847;
sub.f32 %24, f843, f848;
add.f32 %27, f852, f855;
sub.f32 %26, f851, f856;
add.f32 %29, f860, f863;
sub.f32 %28, f859, f864;
add.f32 %31, f868, f871;
sub.f32 %30, f867, f872;
})"
     // Constraint list: 32 "=f" outputs, then smem, the two lookup tables, and the "f" inputs.
     : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y), "=f"(rmem[8].x), "=f"(rmem[8].y), "=f"(rmem[9].x), "=f"(rmem[9].y), "=f"(rmem[10].x), "=f"(rmem[10].y), "=f"(rmem[11].x), "=f"(rmem[11].y), "=f"(rmem[12].x), "=f"(rmem[12].y), "=f"(rmem[13].x), "=f"(rmem[13].y), "=f"(rmem[14].x), "=f"(rmem[14].y), "=f"(rmem[15].x), "=f"(rmem[15].y): "r"(smem), "l"(lut_sp_16_1024), "l"(lut_sp_16_64), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y), "f"(rmem[8].x), "f"(rmem[8].y), "f"(rmem[9].x), "f"(rmem[9].y), "f"(rmem[10].x), "f"(rmem[10].y), "f"(rmem[11].x), "f"(rmem[11].y), "f"(rmem[12].x), "f"(rmem[12].y), "f"(rmem[13].x), "f"(rmem[13].y), "f"(rmem[14].x), "f"(rmem[14].y), "f"(rmem[15].x), "f"(rmem[15].y), "f"(rmem[8].y), "f"(rmem[4].y), "f"(rmem[10].y), "f"(rmem[2].y), "f"(rmem[14].y), "f"(rmem[1].y), "f"(rmem[5].y), "f"(rmem[13].y), "f"(rmem[11].y), "f"(rmem[7].y));
};




// cufftdx_private_function<87, float, 1>
//
// Inline-PTX routine that transforms the four complex single-precision values
// in rmem[0..3] in place, exchanging data between threads through shared memory.
// Presumably part of the 1024-point forward FP32 FFT named by this header's
// include guard; the LUT contents are not visible here -- TODO confirm.
//
// Parameters:
//   rmem - per-thread complex values; rmem[0..3] are read as inputs and
//          overwritten with the results.
//   smem - 32-bit shared-memory address used as the base for all
//          st.shared / ld.shared accesses.
//
// What the PTX does:
//   * Region base = smem + (%tid.y << 13) + ((%tid.x << 5) & -8192), i.e. one
//     8192-byte region per %tid.y and per group of 256 consecutive %tid.x.
//   * Five radix-4 butterfly passes. After each of the first four passes,
//     butterfly outputs 1..3 are multiplied by w, w^2 and w^3, where w is a
//     float2 loaded from lut_sp_4_1024, lut_sp_4_256, lut_sp_4_64 and
//     lut_sp_4_16 in turn (w^2 and w^3 are computed in registers).
//   * LUT entries are 8 bytes; the index is %tid.x & 255, then %tid.x bits
//     2..7, bits 4..7 and bits 6..7 for the successive passes.
//   * Between passes the four complex values are stored to shared memory as
//     float2 pairs and read back at a 2048-byte stride.
//   * Eight unconditional `barrier.sync 0` instructions bracket the shared
//     memory stores and loads, so every thread that barrier 0 waits for must
//     execute this function.
//
// NOTE(review): the asm statement writes shared memory but declares no
// "memory" clobber -- confirm callers do not rely on the compiler ordering
// their own shared-memory accesses around this call.
template<> __forceinline__ __device__ void cufftdx_private_function<87, float, 1>(cufftdx::detail::complex<float> *rmem, unsigned smem){

asm volatile (R"({
.reg .f32 f<277>;
.reg .b32 r<35>;
.reg .b64 rd<15>;
mov.u32 r1, %tid.y;
shl.b32 r2, r1, 13;
mov.u32 r3, %8;
add.s32 r4, r3, r2;
mov.u32 r5, %tid.x;
add.f32 f17, %13, %18;
add.f32 f18, %14, %20;
sub.f32 f19, %13, %18;
sub.f32 f20, %14, %20;
add.f32 f21, %15, %21;
add.f32 f22, %17, %22;
sub.f32 f23, %15, %21;
sub.f32 f24, %17, %22;
sub.f32 f25, f17, f21;
sub.f32 f26, f18, f22;
add.f32 f27, f19, f24;
sub.f32 f28, f20, f23;
sub.f32 f29, f19, f24;
add.f32 f30, f20, f23;
and.b32 r6, r5, 255;
shl.b32 r7, r5, 5;
and.b32 r8, r7, -8192;
add.s32 r9, r4, r8;
shl.b32 r10, r5, 3;
cvt.u64.u32 rd2, r10;
and.b64 rd3, rd2, 2040;
mov.u64 rd4, %9;
add.s64 rd5, rd4, rd3;
ld.global.v2.f32 {f31, f32}, [rd5];
mul.f32 f35, f31, f27;
mul.f32 f36, f32, f28;
mul.f32 f37, f31, f28;
mul.f32 f38, f31, f31;
mul.f32 f39, f32, f32;
sub.f32 f40, f38, f39;
mul.f32 f41, f32, f31;
fma.rn.f32 f42, f32, f31, f41;
mul.f32 f43, f40, f25;
mul.f32 f44, f42, f26;
mul.f32 f45, f40, f26;
mul.f32 f46, f31, f40;
mul.f32 f47, f32, f42;
sub.f32 f48, f46, f47;
mul.f32 f49, f31, f42;
fma.rn.f32 f50, f32, f40, f49;
mul.f32 f51, f48, f29;
mul.f32 f52, f50, f30;
mul.f32 f53, f48, f30;
barrier.sync 0;
and.b32 r11, r7, 8160;
add.s32 r12, r9, r11;
add.f32 f54, f18, f22;
add.f32 f55, f17, f21;
fma.rn.f32 f56, f32, f27, f37;
sub.f32 f57, f35, f36;
st.shared.v4.f32 [r12], {f55, f54, f57, f56};
sub.f32 f58, f43, f44;
fma.rn.f32 f59, f42, f25, f45;
fma.rn.f32 f60, f50, f29, f53;
sub.f32 f61, f51, f52;
st.shared.v4.f32 [r12+16], {f58, f59, f61, f60};
barrier.sync 0;
mad.lo.s32 r13, r6, -24, r12;
ld.shared.v2.f32 {f62, f63}, [r13];
ld.shared.v2.f32 {f66, f67}, [r13+2048];
ld.shared.v2.f32 {f70, f71}, [r13+4096];
ld.shared.v2.f32 {f74, f75}, [r13+6144];
add.f32 f78, f62, f70;
add.f32 f79, f63, f71;
sub.f32 f80, f62, f70;
sub.f32 f81, f63, f71;
add.f32 f82, f66, f74;
add.f32 f83, f67, f75;
sub.f32 f84, f66, f74;
sub.f32 f85, f67, f75;
sub.f32 f86, f78, f82;
sub.f32 f87, f79, f83;
add.f32 f88, f80, f85;
sub.f32 f89, f81, f84;
sub.f32 f90, f80, f85;
add.f32 f91, f81, f84;
and.b32 r14, r5, 252;
bfe.u32 r15, r5, 2, 6;
mul.wide.u32 rd6, r15, 8;
mov.u64 rd7, %10;
add.s64 rd8, rd7, rd6;
ld.global.v2.f32 {f92, f93}, [rd8];
mul.f32 f96, f92, f88;
mul.f32 f97, f93, f89;
mul.f32 f98, f92, f89;
mul.f32 f99, f92, f92;
mul.f32 f100, f93, f93;
sub.f32 f101, f99, f100;
mul.f32 f102, f93, f92;
fma.rn.f32 f103, f93, f92, f102;
mul.f32 f104, f101, f86;
mul.f32 f105, f103, f87;
mul.f32 f106, f101, f87;
mul.f32 f107, f92, f101;
mul.f32 f108, f93, f103;
sub.f32 f109, f107, f108;
mul.f32 f110, f92, f103;
fma.rn.f32 f111, f93, f101, f110;
mul.f32 f112, f109, f90;
mul.f32 f113, f111, f91;
mul.f32 f114, f109, f91;
and.b32 r16, r10, 24;
add.s32 r17, r9, r16;
barrier.sync 0;
and.b32 r18, r7, 8064;
add.s32 r19, r17, r18;
add.f32 f115, f79, f83;
add.f32 f116, f78, f82;
st.shared.v2.f32 [r19], {f116, f115};
fma.rn.f32 f117, f93, f88, f98;
sub.f32 f118, f96, f97;
st.shared.v2.f32 [r19+32], {f118, f117};
fma.rn.f32 f119, f103, f86, f106;
sub.f32 f120, f104, f105;
st.shared.v2.f32 [r19+64], {f120, f119};
sub.f32 f121, f112, f113;
fma.rn.f32 f122, f111, f90, f114;
st.shared.v2.f32 [r19+96], {f121, f122};
barrier.sync 0;
mad.lo.s32 r20, r14, -24, r19;
ld.shared.v2.f32 {f123, f124}, [r20];
ld.shared.v2.f32 {f127, f128}, [r20+2048];
ld.shared.v2.f32 {f131, f132}, [r20+4096];
ld.shared.v2.f32 {f135, f136}, [r20+6144];
add.f32 f139, f123, f131;
add.f32 f140, f124, f132;
sub.f32 f141, f123, f131;
sub.f32 f142, f124, f132;
add.f32 f143, f127, f135;
add.f32 f144, f128, f136;
sub.f32 f145, f127, f135;
sub.f32 f146, f128, f136;
sub.f32 f147, f139, f143;
sub.f32 f148, f140, f144;
add.f32 f149, f141, f146;
sub.f32 f150, f142, f145;
sub.f32 f151, f141, f146;
add.f32 f152, f142, f145;
and.b32 r21, r5, 240;
bfe.u32 r22, r5, 4, 4;
mul.wide.u32 rd9, r22, 8;
mov.u64 rd10, %11;
add.s64 rd11, rd10, rd9;
ld.global.v2.f32 {f153, f154}, [rd11];
mul.f32 f157, f153, f149;
mul.f32 f158, f154, f150;
mul.f32 f159, f153, f150;
mul.f32 f160, f153, f153;
mul.f32 f161, f154, f154;
sub.f32 f162, f160, f161;
mul.f32 f163, f154, f153;
fma.rn.f32 f164, f154, f153, f163;
mul.f32 f165, f162, f147;
mul.f32 f166, f164, f148;
mul.f32 f167, f162, f148;
mul.f32 f168, f153, f162;
mul.f32 f169, f154, f164;
sub.f32 f170, f168, f169;
mul.f32 f171, f153, f164;
fma.rn.f32 f172, f154, f162, f171;
mul.f32 f173, f170, f151;
mul.f32 f174, f172, f152;
mul.f32 f175, f170, f152;
and.b32 r23, r10, 120;
add.s32 r24, r9, r23;
barrier.sync 0;
and.b32 r25, r7, 7680;
add.s32 r26, r24, r25;
add.f32 f176, f140, f144;
add.f32 f177, f139, f143;
st.shared.v2.f32 [r26], {f177, f176};
fma.rn.f32 f178, f154, f149, f159;
sub.f32 f179, f157, f158;
st.shared.v2.f32 [r26+128], {f179, f178};
fma.rn.f32 f180, f164, f147, f167;
sub.f32 f181, f165, f166;
st.shared.v2.f32 [r26+256], {f181, f180};
sub.f32 f182, f173, f174;
fma.rn.f32 f183, f172, f151, f175;
st.shared.v2.f32 [r26+384], {f182, f183};
barrier.sync 0;
mad.lo.s32 r27, r21, -24, r26;
ld.shared.v2.f32 {f184, f185}, [r27];
ld.shared.v2.f32 {f188, f189}, [r27+2048];
ld.shared.v2.f32 {f192, f193}, [r27+4096];
ld.shared.v2.f32 {f196, f197}, [r27+6144];
add.f32 f200, f184, f192;
add.f32 f201, f185, f193;
sub.f32 f202, f184, f192;
sub.f32 f203, f185, f193;
add.f32 f204, f188, f196;
add.f32 f205, f189, f197;
sub.f32 f206, f188, f196;
sub.f32 f207, f189, f197;
sub.f32 f208, f200, f204;
sub.f32 f209, f201, f205;
add.f32 f210, f202, f207;
sub.f32 f211, f203, f206;
sub.f32 f212, f202, f207;
add.f32 f213, f203, f206;
and.b32 r28, r5, 192;
bfe.u32 r29, r5, 6, 2;
mul.wide.u32 rd12, r29, 8;
mov.u64 rd13, %12;
add.s64 rd14, rd13, rd12;
ld.global.v2.f32 {f214, f215}, [rd14];
mul.f32 f218, f214, f210;
mul.f32 f219, f215, f211;
mul.f32 f220, f214, f211;
mul.f32 f221, f214, f214;
mul.f32 f222, f215, f215;
sub.f32 f223, f221, f222;
mul.f32 f224, f215, f214;
fma.rn.f32 f225, f215, f214, f224;
mul.f32 f226, f223, f208;
mul.f32 f227, f225, f209;
mul.f32 f228, f223, f209;
mul.f32 f229, f214, f223;
mul.f32 f230, f215, f225;
sub.f32 f231, f229, f230;
mul.f32 f232, f214, f225;
fma.rn.f32 f233, f215, f223, f232;
mul.f32 f234, f231, f212;
mul.f32 f235, f233, f213;
mul.f32 f236, f231, f213;
and.b32 r30, r10, 504;
add.s32 r31, r9, r30;
barrier.sync 0;
and.b32 r32, r7, 6144;
add.s32 r33, r31, r32;
add.f32 f237, f201, f205;
add.f32 f238, f200, f204;
st.shared.v2.f32 [r33], {f238, f237};
fma.rn.f32 f239, f215, f210, f220;
sub.f32 f240, f218, f219;
st.shared.v2.f32 [r33+512], {f240, f239};
fma.rn.f32 f241, f225, f208, f228;
sub.f32 f242, f226, f227;
st.shared.v2.f32 [r33+1024], {f242, f241};
sub.f32 f243, f234, f235;
fma.rn.f32 f244, f233, f212, f236;
st.shared.v2.f32 [r33+1536], {f243, f244};
barrier.sync 0;
mad.lo.s32 r34, r28, -24, r33;
ld.shared.v2.f32 {f245, f246}, [r34];
ld.shared.v2.f32 {f249, f250}, [r34+2048];
ld.shared.v2.f32 {f253, f254}, [r34+4096];
ld.shared.v2.f32 {f257, f258}, [r34+6144];
add.f32 f261, f245, f253;
add.f32 f262, f246, f254;
sub.f32 f263, f245, f253;
sub.f32 f264, f246, f254;
add.f32 f265, f249, f257;
add.f32 f266, f250, f258;
sub.f32 f267, f249, f257;
sub.f32 f268, f250, f258;
add.f32 %1, f262, f266;
add.f32 %0, f261, f265;
sub.f32 %3, f264, f267;
add.f32 %2, f263, f268;
sub.f32 %5, f262, f266;
sub.f32 %4, f261, f265;
add.f32 %7, f264, f267;
sub.f32 %6, f263, f268;
})"
     // Operand map: %0-%7 = outputs rmem[0..3].{x,y}; %8 = smem; %9-%12 = LUT
     // pointers (lut_sp_4_1024, _256, _64, _16); %13-%22 = input copies of the
     // rmem[0..3] components. rmem[1].y and rmem[2].y are each listed twice; the
     // PTX reads %13, %14, %15, %17, %18, %20, %21, %22 and never %16 or %19.
     : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y): "r"(smem), "l"(lut_sp_4_1024), "l"(lut_sp_4_256), "l"(lut_sp_4_64), "l"(lut_sp_4_16), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y));
};




// cufftdx_private_function<88, float, 1>
//
// Inline-PTX routine that transforms the four complex single-precision values
// in rmem[0..3] in place, exchanging data between threads through shared memory.
// Real and imaginary parts are exchanged in separate passes, so only 4-byte
// floats are ever stored in shared memory. Presumably part of the 1024-point
// forward FP32 FFT named by this header's include guard; the LUT contents are
// not visible here -- TODO confirm.
//
// Parameters:
//   rmem - per-thread complex values; rmem[0..3] are read as inputs and
//          overwritten with the results.
//   smem - 32-bit shared-memory address used as the base for all
//          st.shared / ld.shared accesses.
//
// What the PTX does:
//   * Region base = smem + (%tid.y << 12) + ((%tid.x << 4) & -4096), i.e. one
//     4096-byte region per %tid.y and per group of 256 consecutive %tid.x.
//   * Five radix-4 butterfly passes. After each of the first four passes,
//     butterfly outputs 1..3 are multiplied by w, w^2 and w^3, where w is a
//     float2 loaded from lut_sp_4_1024, lut_sp_4_256, lut_sp_4_64 and
//     lut_sp_4_16 in turn (w^2 and w^3 are computed in registers).
//   * LUT entries are 8 bytes; the index is %tid.x & 255, then %tid.x bits
//     2..7, bits 4..7 and bits 6..7 for the successive passes.
//   * Each exchange stores the four real parts, reads them back at a
//     1024-byte stride, then repeats the same store/load for the four
//     imaginary parts at the same addresses.
//   * Sixteen unconditional `barrier.sync 0` instructions (four per exchange)
//     bracket the shared-memory stores and loads, so every thread that
//     barrier 0 waits for must execute this function.
//
// NOTE(review): the asm statement writes shared memory but declares no
// "memory" clobber -- confirm callers do not rely on the compiler ordering
// their own shared-memory accesses around this call.
template<> __forceinline__ __device__ void cufftdx_private_function<88, float, 1>(cufftdx::detail::complex<float> *rmem, unsigned smem){

asm volatile (R"({
.reg .f32 f<245>;
.reg .b32 r<36>;
.reg .b64 rd<15>;
mov.u32 r1, %tid.y;
shl.b32 r2, r1, 12;
mov.u32 r3, %8;
add.s32 r4, r3, r2;
mov.u32 r5, %tid.x;
add.f32 f17, %13, %18;
add.f32 f18, %14, %20;
sub.f32 f19, %13, %18;
sub.f32 f20, %14, %20;
add.f32 f21, %15, %21;
add.f32 f22, %17, %22;
sub.f32 f23, %15, %21;
sub.f32 f24, %17, %22;
add.f32 f25, f17, f21;
add.f32 f26, f18, f22;
sub.f32 f27, f17, f21;
sub.f32 f28, f18, f22;
add.f32 f29, f19, f24;
sub.f32 f30, f20, f23;
sub.f32 f31, f19, f24;
add.f32 f32, f20, f23;
and.b32 r6, r5, 255;
shl.b32 r7, r5, 3;
cvt.u64.u32 rd2, r7;
and.b64 rd3, rd2, 2040;
mov.u64 rd4, %9;
add.s64 rd5, rd4, rd3;
ld.global.v2.f32 {f33, f34}, [rd5];
mul.f32 f37, f33, f29;
mul.f32 f38, f34, f30;
sub.f32 f39, f37, f38;
mul.f32 f40, f33, f30;
fma.rn.f32 f41, f34, f29, f40;
mul.f32 f42, f33, f33;
mul.f32 f43, f34, f34;
sub.f32 f44, f42, f43;
mul.f32 f45, f34, f33;
fma.rn.f32 f46, f34, f33, f45;
mul.f32 f47, f44, f27;
mul.f32 f48, f46, f28;
sub.f32 f49, f47, f48;
mul.f32 f50, f44, f28;
fma.rn.f32 f51, f46, f27, f50;
mul.f32 f52, f33, f44;
mul.f32 f53, f34, f46;
sub.f32 f54, f52, f53;
mul.f32 f55, f33, f46;
fma.rn.f32 f56, f34, f44, f55;
mul.f32 f57, f54, f31;
mul.f32 f58, f56, f32;
sub.f32 f59, f57, f58;
mul.f32 f60, f54, f32;
fma.rn.f32 f61, f56, f31, f60;
shl.b32 r8, r5, 4;
and.b32 r9, r8, -4096;
add.s32 r10, r4, r9;
barrier.sync 0;
and.b32 r11, r8, 4080;
add.s32 r12, r10, r11;
st.shared.v4.f32 [r12], {f25, f39, f49, f59};
barrier.sync 0;
mad.lo.s32 r13, r6, -12, r12;
ld.shared.f32 f62, [r13];
ld.shared.f32 f63, [r13+1024];
ld.shared.f32 f64, [r13+2048];
ld.shared.f32 f65, [r13+3072];
barrier.sync 0;
st.shared.v4.f32 [r12], {f26, f41, f51, f61};
barrier.sync 0;
ld.shared.f32 f66, [r13];
ld.shared.f32 f67, [r13+1024];
ld.shared.f32 f68, [r13+2048];
ld.shared.f32 f69, [r13+3072];
add.f32 f70, f62, f64;
add.f32 f71, f66, f68;
sub.f32 f72, f62, f64;
sub.f32 f73, f66, f68;
add.f32 f74, f63, f65;
add.f32 f75, f67, f69;
sub.f32 f76, f63, f65;
sub.f32 f77, f67, f69;
add.f32 f78, f70, f74;
add.f32 f79, f71, f75;
sub.f32 f80, f70, f74;
sub.f32 f81, f71, f75;
add.f32 f82, f72, f77;
sub.f32 f83, f73, f76;
sub.f32 f84, f72, f77;
add.f32 f85, f73, f76;
and.b32 r14, r5, 252;
bfe.u32 r15, r5, 2, 6;
mul.wide.u32 rd6, r15, 8;
mov.u64 rd7, %10;
add.s64 rd8, rd7, rd6;
ld.global.v2.f32 {f86, f87}, [rd8];
mul.f32 f90, f86, f82;
mul.f32 f91, f87, f83;
sub.f32 f92, f90, f91;
mul.f32 f93, f86, f83;
fma.rn.f32 f94, f87, f82, f93;
mul.f32 f95, f86, f86;
mul.f32 f96, f87, f87;
sub.f32 f97, f95, f96;
mul.f32 f98, f87, f86;
fma.rn.f32 f99, f87, f86, f98;
mul.f32 f100, f97, f80;
mul.f32 f101, f99, f81;
sub.f32 f102, f100, f101;
mul.f32 f103, f97, f81;
fma.rn.f32 f104, f99, f80, f103;
mul.f32 f105, f86, f97;
mul.f32 f106, f87, f99;
sub.f32 f107, f105, f106;
mul.f32 f108, f86, f99;
fma.rn.f32 f109, f87, f97, f108;
mul.f32 f110, f107, f84;
mul.f32 f111, f109, f85;
sub.f32 f112, f110, f111;
mul.f32 f113, f107, f85;
fma.rn.f32 f114, f109, f84, f113;
shl.b32 r16, r5, 2;
and.b32 r17, r16, 12;
add.s32 r18, r10, r17;
barrier.sync 0;
and.b32 r19, r8, 4032;
add.s32 r20, r18, r19;
st.shared.f32 [r20], f78;
st.shared.f32 [r20+16], f92;
st.shared.f32 [r20+32], f102;
st.shared.f32 [r20+48], f112;
barrier.sync 0;
mad.lo.s32 r21, r14, -12, r20;
ld.shared.f32 f115, [r21];
ld.shared.f32 f116, [r21+1024];
ld.shared.f32 f117, [r21+2048];
ld.shared.f32 f118, [r21+3072];
barrier.sync 0;
st.shared.f32 [r20], f79;
st.shared.f32 [r20+16], f94;
st.shared.f32 [r20+32], f104;
st.shared.f32 [r20+48], f114;
barrier.sync 0;
ld.shared.f32 f119, [r21];
ld.shared.f32 f120, [r21+1024];
ld.shared.f32 f121, [r21+2048];
ld.shared.f32 f122, [r21+3072];
add.f32 f123, f115, f117;
add.f32 f124, f119, f121;
sub.f32 f125, f115, f117;
sub.f32 f126, f119, f121;
add.f32 f127, f116, f118;
add.f32 f128, f120, f122;
sub.f32 f129, f116, f118;
sub.f32 f130, f120, f122;
add.f32 f131, f123, f127;
add.f32 f132, f124, f128;
sub.f32 f133, f123, f127;
sub.f32 f134, f124, f128;
add.f32 f135, f125, f130;
sub.f32 f136, f126, f129;
sub.f32 f137, f125, f130;
add.f32 f138, f126, f129;
and.b32 r22, r5, 240;
bfe.u32 r23, r5, 4, 4;
mul.wide.u32 rd9, r23, 8;
mov.u64 rd10, %11;
add.s64 rd11, rd10, rd9;
ld.global.v2.f32 {f139, f140}, [rd11];
mul.f32 f143, f139, f135;
mul.f32 f144, f140, f136;
sub.f32 f145, f143, f144;
mul.f32 f146, f139, f136;
fma.rn.f32 f147, f140, f135, f146;
mul.f32 f148, f139, f139;
mul.f32 f149, f140, f140;
sub.f32 f150, f148, f149;
mul.f32 f151, f140, f139;
fma.rn.f32 f152, f140, f139, f151;
mul.f32 f153, f150, f133;
mul.f32 f154, f152, f134;
sub.f32 f155, f153, f154;
mul.f32 f156, f150, f134;
fma.rn.f32 f157, f152, f133, f156;
mul.f32 f158, f139, f150;
mul.f32 f159, f140, f152;
sub.f32 f160, f158, f159;
mul.f32 f161, f139, f152;
fma.rn.f32 f162, f140, f150, f161;
mul.f32 f163, f160, f137;
mul.f32 f164, f162, f138;
sub.f32 f165, f163, f164;
mul.f32 f166, f160, f138;
fma.rn.f32 f167, f162, f137, f166;
and.b32 r24, r16, 60;
add.s32 r25, r10, r24;
barrier.sync 0;
and.b32 r26, r8, 3840;
add.s32 r27, r25, r26;
st.shared.f32 [r27], f131;
st.shared.f32 [r27+64], f145;
st.shared.f32 [r27+128], f155;
st.shared.f32 [r27+192], f165;
barrier.sync 0;
mad.lo.s32 r28, r22, -12, r27;
ld.shared.f32 f168, [r28];
ld.shared.f32 f169, [r28+1024];
ld.shared.f32 f170, [r28+2048];
ld.shared.f32 f171, [r28+3072];
barrier.sync 0;
st.shared.f32 [r27], f132;
st.shared.f32 [r27+64], f147;
st.shared.f32 [r27+128], f157;
st.shared.f32 [r27+192], f167;
barrier.sync 0;
ld.shared.f32 f172, [r28];
ld.shared.f32 f173, [r28+1024];
ld.shared.f32 f174, [r28+2048];
ld.shared.f32 f175, [r28+3072];
add.f32 f176, f168, f170;
add.f32 f177, f172, f174;
sub.f32 f178, f168, f170;
sub.f32 f179, f172, f174;
add.f32 f180, f169, f171;
add.f32 f181, f173, f175;
sub.f32 f182, f169, f171;
sub.f32 f183, f173, f175;
add.f32 f184, f176, f180;
add.f32 f185, f177, f181;
sub.f32 f186, f176, f180;
sub.f32 f187, f177, f181;
add.f32 f188, f178, f183;
sub.f32 f189, f179, f182;
sub.f32 f190, f178, f183;
add.f32 f191, f179, f182;
and.b32 r29, r5, 192;
bfe.u32 r30, r5, 6, 2;
mul.wide.u32 rd12, r30, 8;
mov.u64 rd13, %12;
add.s64 rd14, rd13, rd12;
ld.global.v2.f32 {f192, f193}, [rd14];
mul.f32 f196, f192, f188;
mul.f32 f197, f193, f189;
sub.f32 f198, f196, f197;
mul.f32 f199, f192, f189;
fma.rn.f32 f200, f193, f188, f199;
mul.f32 f201, f192, f192;
mul.f32 f202, f193, f193;
sub.f32 f203, f201, f202;
mul.f32 f204, f193, f192;
fma.rn.f32 f205, f193, f192, f204;
mul.f32 f206, f203, f186;
mul.f32 f207, f205, f187;
sub.f32 f208, f206, f207;
mul.f32 f209, f203, f187;
fma.rn.f32 f210, f205, f186, f209;
mul.f32 f211, f192, f203;
mul.f32 f212, f193, f205;
sub.f32 f213, f211, f212;
mul.f32 f214, f192, f205;
fma.rn.f32 f215, f193, f203, f214;
mul.f32 f216, f213, f190;
mul.f32 f217, f215, f191;
sub.f32 f218, f216, f217;
mul.f32 f219, f213, f191;
fma.rn.f32 f220, f215, f190, f219;
and.b32 r31, r16, 252;
add.s32 r32, r10, r31;
barrier.sync 0;
and.b32 r33, r8, 3072;
add.s32 r34, r32, r33;
st.shared.f32 [r34], f184;
st.shared.f32 [r34+256], f198;
st.shared.f32 [r34+512], f208;
st.shared.f32 [r34+768], f218;
barrier.sync 0;
mad.lo.s32 r35, r29, -12, r34;
ld.shared.f32 f221, [r35];
ld.shared.f32 f222, [r35+1024];
ld.shared.f32 f223, [r35+2048];
ld.shared.f32 f224, [r35+3072];
barrier.sync 0;
st.shared.f32 [r34], f185;
st.shared.f32 [r34+256], f200;
st.shared.f32 [r34+512], f210;
st.shared.f32 [r34+768], f220;
barrier.sync 0;
ld.shared.f32 f225, [r35];
ld.shared.f32 f226, [r35+1024];
ld.shared.f32 f227, [r35+2048];
ld.shared.f32 f228, [r35+3072];
add.f32 f229, f221, f223;
add.f32 f230, f225, f227;
sub.f32 f231, f221, f223;
sub.f32 f232, f225, f227;
add.f32 f233, f222, f224;
add.f32 f234, f226, f228;
sub.f32 f235, f222, f224;
sub.f32 f236, f226, f228;
add.f32 %0, f229, f233;
add.f32 %1, f230, f234;
sub.f32 %3, f232, f235;
add.f32 %2, f231, f236;
sub.f32 %4, f229, f233;
sub.f32 %5, f230, f234;
add.f32 %7, f232, f235;
sub.f32 %6, f231, f236;
})"
     // Operand map: %0-%7 = outputs rmem[0..3].{x,y}; %8 = smem; %9-%12 = LUT
     // pointers (lut_sp_4_1024, _256, _64, _16); %13-%22 = input copies of the
     // rmem[0..3] components. rmem[1].y and rmem[2].y are each listed twice; the
     // PTX reads %13, %14, %15, %17, %18, %20, %21, %22 and never %16 or %19.
     : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y): "r"(smem), "l"(lut_sp_4_1024), "l"(lut_sp_4_256), "l"(lut_sp_4_64), "l"(lut_sp_4_16), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y));
};




// cufftdx_private_function<89, float, 1>
//
// Inline-PTX routine that transforms the two complex single-precision values
// in rmem[0..1] in place, exchanging data between threads through shared memory.
// Presumably part of the 1024-point forward FP32 FFT named by this header's
// include guard; the LUT contents are not visible here -- TODO confirm.
//
// Parameters:
//   rmem - per-thread complex values; rmem[0..1] are read as inputs and
//          overwritten with the results.
//   smem - 32-bit shared-memory address used as the base for all
//          st.shared / ld.shared accesses.
//
// What the PTX does:
//   * Region base = smem + (%tid.y << 13) + ((%tid.x << 4) & -8192), i.e. one
//     8192-byte region per %tid.y and per group of 512 consecutive %tid.x.
//   * Ten radix-2 (sum / difference) passes. After each of the first nine
//     passes the difference is multiplied by a float2 loaded from
//     lut_sp_2_1024, _512, _256, _128, _64, _32, _16, _8 and _4 in turn.
//   * LUT entries are 8 bytes; the index is %tid.x & 511 for the first pass
//     and then %tid.x bits k..8 for k = 1..8 in the following passes.
//   * Between passes the sum and the scaled difference are stored to shared
//     memory as float2 pairs and read back from two addresses 4096 bytes apart.
//   * Eighteen unconditional `barrier.sync 0` instructions (two per exchange)
//     bracket the shared-memory stores and loads, so every thread that
//     barrier 0 waits for must execute this function.
//
// NOTE(review): the asm statement writes shared memory but declares no
// "memory" clobber -- confirm callers do not rely on the compiler ordering
// their own shared-memory accesses around this call.
template<> __forceinline__ __device__ void cufftdx_private_function<89, float, 1>(cufftdx::detail::complex<float> *rmem, unsigned smem){

asm volatile (R"({
.reg .f32 f<202>;
.reg .b32 r<70>;
.reg .b64 rd<30>;
mov.u32 r1, %tid.y;
shl.b32 r2, r1, 13;
mov.u32 r3, %4;
add.s32 r4, r3, r2;
mov.u32 r5, %tid.x;
sub.f32 f9, %14, %16;
sub.f32 f10, %15, %17;
shl.b32 r6, r5, 4;
and.b32 r7, r6, -8192;
add.s32 r8, r4, r7;
shl.b32 r9, r5, 3;
cvt.u64.u32 rd2, r9;
and.b64 rd3, rd2, 4088;
mov.u64 rd4, %5;
add.s64 rd5, rd4, rd3;
ld.global.v2.f32 {f11, f12}, [rd5];
mul.f32 f15, f11, f9;
mul.f32 f16, f12, f10;
mul.f32 f17, f11, f10;
barrier.sync 0;
and.b32 r10, r6, 8176;
add.s32 r11, r8, r10;
add.f32 f18, %15, %17;
add.f32 f19, %14, %16;
st.shared.v2.f32 [r11], {f19, f18};
sub.f32 f20, f15, f16;
fma.rn.f32 f21, f12, f9, f17;
st.shared.v2.f32 [r11+8], {f20, f21};
barrier.sync 0;
and.b32 r12, r9, 4088;
sub.s32 r13, r11, r12;
ld.shared.v2.f32 {f22, f23}, [r13];
ld.shared.v2.f32 {f26, f27}, [r13+4096];
sub.f32 f30, f22, f26;
sub.f32 f31, f23, f27;
bfe.u32 r14, r5, 1, 8;
mul.wide.u32 rd6, r14, 8;
mov.u64 rd7, %6;
add.s64 rd8, rd7, rd6;
ld.global.v2.f32 {f32, f33}, [rd8];
mul.f32 f36, f32, f30;
mul.f32 f37, f33, f31;
mul.f32 f38, f32, f31;
and.b32 r15, r9, 8;
add.s32 r16, r8, r15;
barrier.sync 0;
and.b32 r17, r6, 8160;
add.s32 r18, r16, r17;
add.f32 f39, f23, f27;
add.f32 f40, f22, f26;
st.shared.v2.f32 [r18], {f40, f39};
fma.rn.f32 f41, f33, f30, f38;
sub.f32 f42, f36, f37;
st.shared.v2.f32 [r18+16], {f42, f41};
barrier.sync 0;
and.b32 r19, r9, 4080;
sub.s32 r20, r18, r19;
ld.shared.v2.f32 {f43, f44}, [r20];
ld.shared.v2.f32 {f47, f48}, [r20+4096];
sub.f32 f51, f43, f47;
sub.f32 f52, f44, f48;
bfe.u32 r21, r5, 2, 7;
mul.wide.u32 rd9, r21, 8;
mov.u64 rd10, %7;
add.s64 rd11, rd10, rd9;
ld.global.v2.f32 {f53, f54}, [rd11];
mul.f32 f57, f53, f51;
mul.f32 f58, f54, f52;
mul.f32 f59, f53, f52;
and.b32 r22, r9, 24;
add.s32 r23, r8, r22;
barrier.sync 0;
and.b32 r24, r6, 8128;
add.s32 r25, r23, r24;
add.f32 f60, f44, f48;
add.f32 f61, f43, f47;
st.shared.v2.f32 [r25], {f61, f60};
fma.rn.f32 f62, f54, f51, f59;
sub.f32 f63, f57, f58;
st.shared.v2.f32 [r25+32], {f63, f62};
barrier.sync 0;
and.b32 r26, r9, 4064;
sub.s32 r27, r25, r26;
ld.shared.v2.f32 {f64, f65}, [r27];
ld.shared.v2.f32 {f68, f69}, [r27+4096];
sub.f32 f72, f64, f68;
sub.f32 f73, f65, f69;
and.b32 r28, r5, 504;
cvt.u64.u32 rd12, r28;
mov.u64 rd13, %8;
add.s64 rd14, rd13, rd12;
ld.global.v2.f32 {f74, f75}, [rd14];
mul.f32 f78, f74, f72;
mul.f32 f79, f75, f73;
mul.f32 f80, f74, f73;
and.b32 r29, r9, 56;
add.s32 r30, r8, r29;
barrier.sync 0;
and.b32 r31, r6, 8064;
add.s32 r32, r30, r31;
add.f32 f81, f65, f69;
add.f32 f82, f64, f68;
st.shared.v2.f32 [r32], {f82, f81};
fma.rn.f32 f83, f75, f72, f80;
sub.f32 f84, f78, f79;
st.shared.v2.f32 [r32+64], {f84, f83};
barrier.sync 0;
and.b32 r33, r9, 4032;
sub.s32 r34, r32, r33;
ld.shared.v2.f32 {f85, f86}, [r34];
ld.shared.v2.f32 {f89, f90}, [r34+4096];
sub.f32 f93, f85, f89;
sub.f32 f94, f86, f90;
bfe.u32 r35, r5, 4, 5;
mul.wide.u32 rd15, r35, 8;
mov.u64 rd16, %9;
add.s64 rd17, rd16, rd15;
ld.global.v2.f32 {f95, f96}, [rd17];
mul.f32 f99, f95, f93;
mul.f32 f100, f96, f94;
mul.f32 f101, f95, f94;
and.b32 r36, r9, 120;
add.s32 r37, r8, r36;
barrier.sync 0;
and.b32 r38, r6, 7936;
add.s32 r39, r37, r38;
add.f32 f102, f86, f90;
add.f32 f103, f85, f89;
st.shared.v2.f32 [r39], {f103, f102};
fma.rn.f32 f104, f96, f93, f101;
sub.f32 f105, f99, f100;
st.shared.v2.f32 [r39+128], {f105, f104};
barrier.sync 0;
and.b32 r40, r9, 3968;
sub.s32 r41, r39, r40;
ld.shared.v2.f32 {f106, f107}, [r41];
ld.shared.v2.f32 {f110, f111}, [r41+4096];
sub.f32 f114, f106, f110;
sub.f32 f115, f107, f111;
bfe.u32 r42, r5, 5, 4;
mul.wide.u32 rd18, r42, 8;
mov.u64 rd19, %10;
add.s64 rd20, rd19, rd18;
ld.global.v2.f32 {f116, f117}, [rd20];
mul.f32 f120, f116, f114;
mul.f32 f121, f117, f115;
mul.f32 f122, f116, f115;
and.b32 r43, r9, 248;
add.s32 r44, r8, r43;
barrier.sync 0;
and.b32 r45, r6, 7680;
add.s32 r46, r44, r45;
add.f32 f123, f107, f111;
add.f32 f124, f106, f110;
st.shared.v2.f32 [r46], {f124, f123};
fma.rn.f32 f125, f117, f114, f122;
sub.f32 f126, f120, f121;
st.shared.v2.f32 [r46+256], {f126, f125};
barrier.sync 0;
and.b32 r47, r9, 3840;
sub.s32 r48, r46, r47;
ld.shared.v2.f32 {f127, f128}, [r48];
ld.shared.v2.f32 {f131, f132}, [r48+4096];
sub.f32 f135, f127, f131;
sub.f32 f136, f128, f132;
bfe.u32 r49, r5, 6, 3;
mul.wide.u32 rd21, r49, 8;
mov.u64 rd22, %11;
add.s64 rd23, rd22, rd21;
ld.global.v2.f32 {f137, f138}, [rd23];
mul.f32 f141, f137, f135;
mul.f32 f142, f138, f136;
mul.f32 f143, f137, f136;
and.b32 r50, r9, 504;
add.s32 r51, r8, r50;
barrier.sync 0;
and.b32 r52, r6, 7168;
add.s32 r53, r51, r52;
add.f32 f144, f128, f132;
add.f32 f145, f127, f131;
st.shared.v2.f32 [r53], {f145, f144};
fma.rn.f32 f146, f138, f135, f143;
sub.f32 f147, f141, f142;
st.shared.v2.f32 [r53+512], {f147, f146};
barrier.sync 0;
and.b32 r54, r9, 3584;
sub.s32 r55, r53, r54;
ld.shared.v2.f32 {f148, f149}, [r55];
ld.shared.v2.f32 {f152, f153}, [r55+4096];
sub.f32 f156, f148, f152;
sub.f32 f157, f149, f153;
bfe.u32 r56, r5, 7, 2;
mul.wide.u32 rd24, r56, 8;
mov.u64 rd25, %12;
add.s64 rd26, rd25, rd24;
ld.global.v2.f32 {f158, f159}, [rd26];
mul.f32 f162, f158, f156;
mul.f32 f163, f159, f157;
mul.f32 f164, f158, f157;
and.b32 r57, r9, 1016;
add.s32 r58, r8, r57;
barrier.sync 0;
and.b32 r59, r6, 6144;
add.s32 r60, r58, r59;
add.f32 f165, f149, f153;
add.f32 f166, f148, f152;
st.shared.v2.f32 [r60], {f166, f165};
fma.rn.f32 f167, f159, f156, f164;
sub.f32 f168, f162, f163;
st.shared.v2.f32 [r60+1024], {f168, f167};
barrier.sync 0;
and.b32 r61, r9, 3072;
sub.s32 r62, r60, r61;
ld.shared.v2.f32 {f169, f170}, [r62];
ld.shared.v2.f32 {f173, f174}, [r62+4096];
sub.f32 f177, f169, f173;
sub.f32 f178, f170, f174;
bfe.u32 r63, r5, 8, 1;
mul.wide.u32 rd27, r63, 8;
mov.u64 rd28, %13;
add.s64 rd29, rd28, rd27;
ld.global.v2.f32 {f179, f180}, [rd29];
mul.f32 f183, f179, f177;
mul.f32 f184, f180, f178;
mul.f32 f185, f179, f178;
and.b32 r64, r9, 2040;
add.s32 r65, r8, r64;
barrier.sync 0;
and.b32 r66, r6, 4096;
add.s32 r67, r65, r66;
add.f32 f186, f170, f174;
add.f32 f187, f169, f173;
st.shared.v2.f32 [r67], {f187, f186};
fma.rn.f32 f188, f180, f177, f185;
sub.f32 f189, f183, f184;
st.shared.v2.f32 [r67+2048], {f189, f188};
barrier.sync 0;
and.b32 r68, r9, 2048;
sub.s32 r69, r67, r68;
ld.shared.v2.f32 {f190, f191}, [r69];
ld.shared.v2.f32 {f194, f195}, [r69+4096];
add.f32 %1, f191, f195;
add.f32 %0, f190, f194;
sub.f32 %3, f191, f195;
sub.f32 %2, f190, f194;
})"
     // Operand map: %0-%3 = outputs rmem[0..1].{x,y}; %4 = smem; %5-%13 = LUT
     // pointers (lut_sp_2_1024 down to lut_sp_2_4); %14-%17 = input copies of
     // rmem[0].x, rmem[0].y, rmem[1].x, rmem[1].y.
     : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y): "r"(smem), "l"(lut_sp_2_1024), "l"(lut_sp_2_512), "l"(lut_sp_2_256), "l"(lut_sp_2_128), "l"(lut_sp_2_64), "l"(lut_sp_2_32), "l"(lut_sp_2_16), "l"(lut_sp_2_8), "l"(lut_sp_2_4), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y));
};




// cufftdx_private_function<90, float, 1>
//
// Inline-PTX routine that transforms the two complex single-precision values
// in rmem[0..1] in place, exchanging data between threads through shared memory.
// Real and imaginary parts are exchanged in separate passes, so only 4-byte
// floats are ever stored in shared memory. Presumably part of the 1024-point
// forward FP32 FFT named by this header's include guard; the LUT contents are
// not visible here -- TODO confirm.
//
// Parameters:
//   rmem - per-thread complex values; rmem[0..1] are read as inputs and
//          overwritten with the results.
//   smem - 32-bit shared-memory address used as the base for all
//          st.shared / ld.shared accesses.
//
// What the PTX does:
//   * Region base = smem + (%tid.y << 12) + ((%tid.x << 3) & -4096), i.e. one
//     4096-byte region per %tid.y and per group of 512 consecutive %tid.x.
//   * Ten radix-2 (sum / difference) passes. After each of the first nine
//     passes the difference is multiplied by a float2 loaded from
//     lut_sp_2_1024, _512, _256, _128, _64, _32, _16, _8 and _4 in turn.
//   * LUT entries are 8 bytes; the index is %tid.x & 511 for the first pass
//     and then %tid.x bits k..8 for k = 1..8 in the following passes.
//   * Each exchange stores the two real parts, reads them back from two
//     addresses 2048 bytes apart, then repeats the same store/load for the
//     two imaginary parts at the same addresses.
//   * Thirty-six unconditional `barrier.sync 0` instructions (four per
//     exchange) bracket the shared-memory stores and loads, so every thread
//     that barrier 0 waits for must execute this function.
//
// NOTE(review): the asm statement writes shared memory but declares no
// "memory" clobber -- confirm callers do not rely on the compiler ordering
// their own shared-memory accesses around this call.
template<> __forceinline__ __device__ void cufftdx_private_function<90, float, 1>(cufftdx::detail::complex<float> *rmem, unsigned smem){

asm volatile (R"({
.reg .f32 f<166>;
.reg .b32 r<70>;
.reg .b64 rd<30>;
mov.u32 r1, %tid.y;
shl.b32 r2, r1, 12;
mov.u32 r3, %4;
add.s32 r4, r3, r2;
mov.u32 r5, %tid.x;
add.f32 f9, %14, %16;
add.f32 f10, %15, %17;
sub.f32 f11, %14, %16;
sub.f32 f12, %15, %17;
shl.b32 r6, r5, 3;
cvt.u64.u32 rd2, r6;
and.b64 rd3, rd2, 4088;
mov.u64 rd4, %5;
add.s64 rd5, rd4, rd3;
ld.global.v2.f32 {f13, f14}, [rd5];
mul.f32 f17, f13, f11;
mul.f32 f18, f14, f12;
sub.f32 f19, f17, f18;
mul.f32 f20, f13, f12;
fma.rn.f32 f21, f14, f11, f20;
and.b32 r7, r6, -4096;
add.s32 r8, r4, r7;
barrier.sync 0;
and.b32 r9, r6, 4088;
add.s32 r10, r8, r9;
st.shared.v2.f32 [r10], {f9, f19};
barrier.sync 0;
shl.b32 r11, r5, 2;
and.b32 r12, r11, 2044;
sub.s32 r13, r10, r12;
ld.shared.f32 f22, [r13];
ld.shared.f32 f23, [r13+2048];
barrier.sync 0;
st.shared.v2.f32 [r10], {f10, f21};
barrier.sync 0;
ld.shared.f32 f24, [r13];
ld.shared.f32 f25, [r13+2048];
add.f32 f26, f22, f23;
add.f32 f27, f24, f25;
sub.f32 f28, f22, f23;
sub.f32 f29, f24, f25;
bfe.u32 r14, r5, 1, 8;
mul.wide.u32 rd6, r14, 8;
mov.u64 rd7, %6;
add.s64 rd8, rd7, rd6;
ld.global.v2.f32 {f30, f31}, [rd8];
mul.f32 f34, f30, f28;
mul.f32 f35, f31, f29;
sub.f32 f36, f34, f35;
mul.f32 f37, f30, f29;
fma.rn.f32 f38, f31, f28, f37;
and.b32 r15, r11, 4;
add.s32 r16, r8, r15;
barrier.sync 0;
and.b32 r17, r6, 4080;
add.s32 r18, r16, r17;
st.shared.f32 [r18], f26;
st.shared.f32 [r18+8], f36;
barrier.sync 0;
and.b32 r19, r11, 2040;
sub.s32 r20, r18, r19;
ld.shared.f32 f39, [r20];
ld.shared.f32 f40, [r20+2048];
barrier.sync 0;
st.shared.f32 [r18], f27;
st.shared.f32 [r18+8], f38;
barrier.sync 0;
ld.shared.f32 f41, [r20];
ld.shared.f32 f42, [r20+2048];
add.f32 f43, f39, f40;
add.f32 f44, f41, f42;
sub.f32 f45, f39, f40;
sub.f32 f46, f41, f42;
bfe.u32 r21, r5, 2, 7;
mul.wide.u32 rd9, r21, 8;
mov.u64 rd10, %7;
add.s64 rd11, rd10, rd9;
ld.global.v2.f32 {f47, f48}, [rd11];
mul.f32 f51, f47, f45;
mul.f32 f52, f48, f46;
sub.f32 f53, f51, f52;
mul.f32 f54, f47, f46;
fma.rn.f32 f55, f48, f45, f54;
and.b32 r22, r11, 12;
add.s32 r23, r8, r22;
barrier.sync 0;
and.b32 r24, r6, 4064;
add.s32 r25, r23, r24;
st.shared.f32 [r25], f43;
st.shared.f32 [r25+16], f53;
barrier.sync 0;
and.b32 r26, r11, 2032;
sub.s32 r27, r25, r26;
ld.shared.f32 f56, [r27];
ld.shared.f32 f57, [r27+2048];
barrier.sync 0;
st.shared.f32 [r25], f44;
st.shared.f32 [r25+16], f55;
barrier.sync 0;
ld.shared.f32 f58, [r27];
ld.shared.f32 f59, [r27+2048];
add.f32 f60, f56, f57;
add.f32 f61, f58, f59;
sub.f32 f62, f56, f57;
sub.f32 f63, f58, f59;
and.b32 r28, r5, 504;
cvt.u64.u32 rd12, r28;
mov.u64 rd13, %8;
add.s64 rd14, rd13, rd12;
ld.global.v2.f32 {f64, f65}, [rd14];
mul.f32 f68, f64, f62;
mul.f32 f69, f65, f63;
sub.f32 f70, f68, f69;
mul.f32 f71, f64, f63;
fma.rn.f32 f72, f65, f62, f71;
and.b32 r29, r11, 28;
add.s32 r30, r8, r29;
barrier.sync 0;
and.b32 r31, r6, 4032;
add.s32 r32, r30, r31;
st.shared.f32 [r32], f60;
st.shared.f32 [r32+32], f70;
barrier.sync 0;
and.b32 r33, r11, 2016;
sub.s32 r34, r32, r33;
ld.shared.f32 f73, [r34];
ld.shared.f32 f74, [r34+2048];
barrier.sync 0;
st.shared.f32 [r32], f61;
st.shared.f32 [r32+32], f72;
barrier.sync 0;
ld.shared.f32 f75, [r34];
ld.shared.f32 f76, [r34+2048];
add.f32 f77, f73, f74;
add.f32 f78, f75, f76;
sub.f32 f79, f73, f74;
sub.f32 f80, f75, f76;
bfe.u32 r35, r5, 4, 5;
mul.wide.u32 rd15, r35, 8;
mov.u64 rd16, %9;
add.s64 rd17, rd16, rd15;
ld.global.v2.f32 {f81, f82}, [rd17];
mul.f32 f85, f81, f79;
mul.f32 f86, f82, f80;
sub.f32 f87, f85, f86;
mul.f32 f88, f81, f80;
fma.rn.f32 f89, f82, f79, f88;
and.b32 r36, r11, 60;
add.s32 r37, r8, r36;
barrier.sync 0;
and.b32 r38, r6, 3968;
add.s32 r39, r37, r38;
st.shared.f32 [r39], f77;
st.shared.f32 [r39+64], f87;
barrier.sync 0;
and.b32 r40, r11, 1984;
sub.s32 r41, r39, r40;
ld.shared.f32 f90, [r41];
ld.shared.f32 f91, [r41+2048];
barrier.sync 0;
st.shared.f32 [r39], f78;
st.shared.f32 [r39+64], f89;
barrier.sync 0;
ld.shared.f32 f92, [r41];
ld.shared.f32 f93, [r41+2048];
add.f32 f94, f90, f91;
add.f32 f95, f92, f93;
sub.f32 f96, f90, f91;
sub.f32 f97, f92, f93;
bfe.u32 r42, r5, 5, 4;
mul.wide.u32 rd18, r42, 8;
mov.u64 rd19, %10;
add.s64 rd20, rd19, rd18;
ld.global.v2.f32 {f98, f99}, [rd20];
mul.f32 f102, f98, f96;
mul.f32 f103, f99, f97;
sub.f32 f104, f102, f103;
mul.f32 f105, f98, f97;
fma.rn.f32 f106, f99, f96, f105;
and.b32 r43, r11, 124;
add.s32 r44, r8, r43;
barrier.sync 0;
and.b32 r45, r6, 3840;
add.s32 r46, r44, r45;
st.shared.f32 [r46], f94;
st.shared.f32 [r46+128], f104;
barrier.sync 0;
and.b32 r47, r11, 1920;
sub.s32 r48, r46, r47;
ld.shared.f32 f107, [r48];
ld.shared.f32 f108, [r48+2048];
barrier.sync 0;
st.shared.f32 [r46], f95;
st.shared.f32 [r46+128], f106;
barrier.sync 0;
ld.shared.f32 f109, [r48];
ld.shared.f32 f110, [r48+2048];
add.f32 f111, f107, f108;
add.f32 f112, f109, f110;
sub.f32 f113, f107, f108;
sub.f32 f114, f109, f110;
bfe.u32 r49, r5, 6, 3;
mul.wide.u32 rd21, r49, 8;
mov.u64 rd22, %11;
add.s64 rd23, rd22, rd21;
ld.global.v2.f32 {f115, f116}, [rd23];
mul.f32 f119, f115, f113;
mul.f32 f120, f116, f114;
sub.f32 f121, f119, f120;
mul.f32 f122, f115, f114;
fma.rn.f32 f123, f116, f113, f122;
and.b32 r50, r11, 252;
add.s32 r51, r8, r50;
barrier.sync 0;
and.b32 r52, r6, 3584;
add.s32 r53, r51, r52;
st.shared.f32 [r53], f111;
st.shared.f32 [r53+256], f121;
barrier.sync 0;
and.b32 r54, r11, 1792;
sub.s32 r55, r53, r54;
ld.shared.f32 f124, [r55];
ld.shared.f32 f125, [r55+2048];
barrier.sync 0;
st.shared.f32 [r53], f112;
st.shared.f32 [r53+256], f123;
barrier.sync 0;
ld.shared.f32 f126, [r55];
ld.shared.f32 f127, [r55+2048];
add.f32 f128, f124, f125;
add.f32 f129, f126, f127;
sub.f32 f130, f124, f125;
sub.f32 f131, f126, f127;
bfe.u32 r56, r5, 7, 2;
mul.wide.u32 rd24, r56, 8;
mov.u64 rd25, %12;
add.s64 rd26, rd25, rd24;
ld.global.v2.f32 {f132, f133}, [rd26];
mul.f32 f136, f132, f130;
mul.f32 f137, f133, f131;
sub.f32 f138, f136, f137;
mul.f32 f139, f132, f131;
fma.rn.f32 f140, f133, f130, f139;
and.b32 r57, r11, 508;
add.s32 r58, r8, r57;
barrier.sync 0;
and.b32 r59, r6, 3072;
add.s32 r60, r58, r59;
st.shared.f32 [r60], f128;
st.shared.f32 [r60+512], f138;
barrier.sync 0;
and.b32 r61, r11, 1536;
sub.s32 r62, r60, r61;
ld.shared.f32 f141, [r62];
ld.shared.f32 f142, [r62+2048];
barrier.sync 0;
st.shared.f32 [r60], f129;
st.shared.f32 [r60+512], f140;
barrier.sync 0;
ld.shared.f32 f143, [r62];
ld.shared.f32 f144, [r62+2048];
add.f32 f145, f141, f142;
add.f32 f146, f143, f144;
sub.f32 f147, f141, f142;
sub.f32 f148, f143, f144;
bfe.u32 r63, r5, 8, 1;
mul.wide.u32 rd27, r63, 8;
mov.u64 rd28, %13;
add.s64 rd29, rd28, rd27;
ld.global.v2.f32 {f149, f150}, [rd29];
mul.f32 f153, f149, f147;
mul.f32 f154, f150, f148;
sub.f32 f155, f153, f154;
mul.f32 f156, f149, f148;
fma.rn.f32 f157, f150, f147, f156;
and.b32 r64, r11, 1020;
add.s32 r65, r8, r64;
barrier.sync 0;
and.b32 r66, r6, 2048;
add.s32 r67, r65, r66;
st.shared.f32 [r67], f145;
st.shared.f32 [r67+1024], f155;
barrier.sync 0;
and.b32 r68, r11, 1024;
sub.s32 r69, r67, r68;
ld.shared.f32 f158, [r69];
ld.shared.f32 f159, [r69+2048];
barrier.sync 0;
st.shared.f32 [r67], f146;
st.shared.f32 [r67+1024], f157;
barrier.sync 0;
ld.shared.f32 f160, [r69];
ld.shared.f32 f161, [r69+2048];
add.f32 %0, f158, f159;
add.f32 %1, f160, f161;
sub.f32 %2, f158, f159;
sub.f32 %3, f160, f161;
})"
     // Operand map: %0-%3 = outputs rmem[0..1].{x,y}; %4 = smem; %5-%13 = LUT
     // pointers (lut_sp_2_1024 down to lut_sp_2_4); %14-%17 = input copies of
     // rmem[0].x, rmem[0].y, rmem[1].x, rmem[1].y.
     : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y): "r"(smem), "l"(lut_sp_2_1024), "l"(lut_sp_2_512), "l"(lut_sp_2_256), "l"(lut_sp_2_128), "l"(lut_sp_2_64), "l"(lut_sp_2_32), "l"(lut_sp_2_16), "l"(lut_sp_2_8), "l"(lut_sp_2_4), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y));
};


#endif
