#ifndef CUFFTDX_FFT_128_FP32_FWD_PTX_HPP
#define CUFFTDX_FFT_128_FP32_FWD_PTX_HPP



// Explicit specialization <51, float, 1> of cufftdx_private_function: a single
// hand-scheduled inline-PTX stage that transforms 16 complex<float> values held
// per thread, exchanging data between threads through shared memory.
// NOTE(review): the include guard of this header suggests the routine is part of
// a 128-point fp32 forward FFT -- confirm against the generator/dispatcher.
//
// Parameters:
//   rmem - per-thread array; rmem[0..15] are read as inputs and every one of
//          them is overwritten with the result.
//   smem - 32-bit shared-memory address used as the base of the exchange
//          buffer (it is fed directly into st.shared / ld.shared addresses).
//
// Also reads lut_sp_16_128 (declared outside this block) through ld.global.
//
// What the PTX does, in order:
//   1. Add/sub butterfly network over the 16 inputs using the constants
//      0f3F3504F3 (~0.70710677), 0f3F6C835E (~0.9238795) and 0f3EC3EF15
//      (~0.38268343) -- presumably a 16-point DFT; verify before relying on it.
//   2. Loads one float2 "w" from lut_sp_16_128 at element index (tid.x & 7),
//      builds w^2 .. w^15 by repeated complex multiplication and multiplies
//      butterfly outputs 1..15 by w^1 .. w^15 (output 0 is left unscaled).
//   3. Shared-memory exchange, done twice over the SAME locations: first for
//      the 16 real parts, then for the 16 imaginary parts. Each thread stores
//      64 contiguous bytes at  smem + tid.y*512 + tid.x*64  and, after a
//      barrier, loads 16 floats at a 32-byte stride starting 60*(tid.x & 7)
//      bytes below its own store address.
//      NOTE(review): the 512-byte row per tid.y looks like it assumes
//      blockDim.x == 8 -- confirm with the launch configuration.
//   4. Two independent 8-input butterfly networks, one over the even-indexed
//      loaded values and one over the odd-indexed ones; their results are
//      written to the even and odd elements of rmem respectively.
//
// Synchronization: the asm executes "barrier.sync 0" four times, so the call
// has to be reached by all threads taking part in barrier 0.
template<> __forceinline__ __device__ void cufftdx_private_function<51, float, 1>(cufftdx::detail::complex<float> *rmem, unsigned smem){

// Operand numbering used inside the asm string: %0-%31 are the outputs,
// %32 is smem, %33 is the lookup-table address, %34-%75 are the float inputs.
asm volatile (R"({
.reg .f32 f<535>;
.reg .b32 r<14>;
.reg .b64 rd<6>;
mov.u32 r1, %tid.y;
shl.b32 r2, r1, 9;
mov.u32 r3, %32;
add.s32 r4, r3, r2;
mov.u32 r5, %tid.x;
add.f32 f65, %34, %55;
add.f32 f66, %35, %57;
sub.f32 f67, %34, %55;
sub.f32 f68, %35, %57;
add.f32 f69, %44, %66;
add.f32 f70, %46, %67;
sub.f32 f71, %44, %66;
sub.f32 f72, %46, %67;
add.f32 f73, f65, f69;
add.f32 f74, f66, f70;
sub.f32 f75, f65, f69;
sub.f32 f76, f66, f70;
add.f32 f77, f67, f72;
sub.f32 f78, f68, f71;
sub.f32 f79, f67, f72;
add.f32 f80, f68, f71;
add.f32 f81, %39, %60;
add.f32 f82, %41, %62;
sub.f32 f83, %39, %60;
sub.f32 f84, %41, %62;
add.f32 f85, %50, %71;
add.f32 f86, %51, %73;
sub.f32 f87, %50, %71;
sub.f32 f88, %51, %73;
add.f32 f89, f81, f85;
add.f32 f90, f82, f86;
sub.f32 f91, f81, f85;
sub.f32 f92, f82, f86;
add.f32 f93, f83, f88;
sub.f32 f94, f84, f87;
sub.f32 f95, f83, f88;
add.f32 f96, f84, f87;
mul.f32 f97, f93, 0f3F3504F3;
mul.f32 f98, f94, 0fBF3504F3;
sub.f32 f99, f97, f98;
mul.f32 f100, f94, 0f3F3504F3;
fma.rn.f32 f101, f93, 0fBF3504F3, f100;
mul.f32 f102, f95, 0fBF3504F3;
mul.f32 f103, f96, 0fBF3504F3;
sub.f32 f104, f102, f103;
add.f32 f105, f102, f103;
add.f32 f106, f73, f89;
add.f32 f107, f74, f90;
sub.f32 f108, f73, f89;
sub.f32 f109, f74, f90;
add.f32 f110, f77, f99;
add.f32 f111, f78, f101;
sub.f32 f112, f77, f99;
sub.f32 f113, f78, f101;
add.f32 f114, f75, f92;
sub.f32 f115, f76, f91;
sub.f32 f116, f75, f92;
add.f32 f117, f76, f91;
add.f32 f118, f79, f104;
add.f32 f119, f80, f105;
sub.f32 f120, f79, f104;
sub.f32 f121, f80, f105;
add.f32 f122, %36, %58;
add.f32 f123, %38, %59;
sub.f32 f124, %36, %58;
sub.f32 f125, %38, %59;
add.f32 f126, %47, %68;
add.f32 f127, %49, %70;
sub.f32 f128, %47, %68;
sub.f32 f129, %49, %70;
add.f32 f130, f122, f126;
add.f32 f131, f123, f127;
sub.f32 f132, f122, f126;
sub.f32 f133, f123, f127;
add.f32 f134, f124, f129;
sub.f32 f135, f125, f128;
sub.f32 f136, f124, f129;
add.f32 f137, f125, f128;
add.f32 f138, %42, %63;
add.f32 f139, %43, %65;
sub.f32 f140, %42, %63;
sub.f32 f141, %43, %65;
add.f32 f142, %52, %74;
add.f32 f143, %54, %75;
sub.f32 f144, %52, %74;
sub.f32 f145, %54, %75;
add.f32 f146, f138, f142;
add.f32 f147, f139, f143;
sub.f32 f148, f138, f142;
sub.f32 f149, f139, f143;
add.f32 f150, f140, f145;
sub.f32 f151, f141, f144;
sub.f32 f152, f140, f145;
add.f32 f153, f141, f144;
mul.f32 f154, f150, 0f3F3504F3;
mul.f32 f155, f151, 0fBF3504F3;
sub.f32 f156, f154, f155;
mul.f32 f157, f151, 0f3F3504F3;
fma.rn.f32 f158, f150, 0fBF3504F3, f157;
mul.f32 f159, f152, 0fBF3504F3;
mul.f32 f160, f153, 0fBF3504F3;
sub.f32 f161, f159, f160;
add.f32 f162, f159, f160;
add.f32 f163, f130, f146;
add.f32 f164, f131, f147;
sub.f32 f165, f130, f146;
sub.f32 f166, f131, f147;
add.f32 f167, f134, f156;
add.f32 f168, f135, f158;
sub.f32 f169, f134, f156;
sub.f32 f170, f135, f158;
add.f32 f171, f132, f149;
sub.f32 f172, f133, f148;
sub.f32 f173, f132, f149;
add.f32 f174, f133, f148;
add.f32 f175, f136, f161;
add.f32 f176, f137, f162;
sub.f32 f177, f136, f161;
sub.f32 f178, f137, f162;
mul.f32 f179, f167, 0f3F6C835E;
mul.f32 f180, f168, 0fBEC3EF15;
sub.f32 f181, f179, f180;
mul.f32 f182, f168, 0f3F6C835E;
fma.rn.f32 f183, f167, 0fBEC3EF15, f182;
mul.f32 f184, f171, 0f3F3504F3;
mul.f32 f185, f172, 0fBF3504F3;
sub.f32 f186, f184, f185;
mul.f32 f187, f172, 0f3F3504F3;
fma.rn.f32 f188, f171, 0fBF3504F3, f187;
mul.f32 f189, f175, 0f3EC3EF15;
mul.f32 f190, f176, 0fBF6C835E;
sub.f32 f191, f189, f190;
mul.f32 f192, f176, 0f3EC3EF15;
fma.rn.f32 f193, f175, 0fBF6C835E, f192;
mul.f32 f194, f169, 0fBEC3EF15;
mul.f32 f195, f170, 0fBF6C835E;
sub.f32 f196, f194, f195;
mul.f32 f197, f170, 0fBEC3EF15;
fma.rn.f32 f198, f169, 0fBF6C835E, f197;
mul.f32 f199, f173, 0fBF3504F3;
mul.f32 f200, f174, 0fBF3504F3;
sub.f32 f201, f199, f200;
add.f32 f202, f199, f200;
mul.f32 f203, f177, 0fBF6C835E;
mul.f32 f204, f178, 0fBEC3EF15;
sub.f32 f205, f203, f204;
mul.f32 f206, f178, 0fBF6C835E;
fma.rn.f32 f207, f177, 0fBEC3EF15, f206;
add.f32 f208, f106, f163;
add.f32 f209, f107, f164;
sub.f32 f210, f106, f163;
sub.f32 f211, f107, f164;
add.f32 f212, f110, f181;
add.f32 f213, f111, f183;
sub.f32 f214, f110, f181;
sub.f32 f215, f111, f183;
add.f32 f216, f114, f186;
add.f32 f217, f115, f188;
sub.f32 f218, f114, f186;
sub.f32 f219, f115, f188;
add.f32 f220, f118, f191;
add.f32 f221, f119, f193;
sub.f32 f222, f118, f191;
sub.f32 f223, f119, f193;
add.f32 f224, f108, f166;
sub.f32 f225, f109, f165;
sub.f32 f226, f108, f166;
add.f32 f227, f109, f165;
add.f32 f228, f112, f196;
add.f32 f229, f113, f198;
sub.f32 f230, f112, f196;
sub.f32 f231, f113, f198;
add.f32 f232, f116, f201;
add.f32 f233, f117, f202;
sub.f32 f234, f116, f201;
sub.f32 f235, f117, f202;
add.f32 f236, f120, f205;
add.f32 f237, f121, f207;
sub.f32 f238, f120, f205;
sub.f32 f239, f121, f207;
and.b32 r6, r5, 7;
shl.b32 r7, r5, 3;
cvt.u64.u32 rd2, r7;
and.b64 rd3, rd2, 56;
mov.u64 rd4, %33;
add.s64 rd5, rd4, rd3;
ld.global.v2.f32 {f240, f241}, [rd5];
mul.f32 f244, f240, f212;
mul.f32 f245, f241, f213;
sub.f32 f246, f244, f245;
mul.f32 f247, f240, f213;
fma.rn.f32 f248, f241, f212, f247;
mul.f32 f249, f240, f240;
mul.f32 f250, f241, f241;
sub.f32 f251, f249, f250;
mul.f32 f252, f241, f240;
fma.rn.f32 f253, f241, f240, f252;
mul.f32 f254, f251, f216;
mul.f32 f255, f253, f217;
sub.f32 f256, f254, f255;
mul.f32 f257, f251, f217;
fma.rn.f32 f258, f253, f216, f257;
mul.f32 f259, f240, f251;
mul.f32 f260, f241, f253;
sub.f32 f261, f259, f260;
mul.f32 f262, f240, f253;
fma.rn.f32 f263, f241, f251, f262;
mul.f32 f264, f261, f220;
mul.f32 f265, f263, f221;
sub.f32 f266, f264, f265;
mul.f32 f267, f261, f221;
fma.rn.f32 f268, f263, f220, f267;
mul.f32 f269, f240, f261;
mul.f32 f270, f241, f263;
sub.f32 f271, f269, f270;
mul.f32 f272, f240, f263;
fma.rn.f32 f273, f241, f261, f272;
mul.f32 f274, f271, f224;
mul.f32 f275, f273, f225;
sub.f32 f276, f274, f275;
mul.f32 f277, f271, f225;
fma.rn.f32 f278, f273, f224, f277;
mul.f32 f279, f240, f271;
mul.f32 f280, f241, f273;
sub.f32 f281, f279, f280;
mul.f32 f282, f240, f273;
fma.rn.f32 f283, f241, f271, f282;
mul.f32 f284, f281, f228;
mul.f32 f285, f283, f229;
sub.f32 f286, f284, f285;
mul.f32 f287, f281, f229;
fma.rn.f32 f288, f283, f228, f287;
mul.f32 f289, f240, f281;
mul.f32 f290, f241, f283;
sub.f32 f291, f289, f290;
mul.f32 f292, f240, f283;
fma.rn.f32 f293, f241, f281, f292;
mul.f32 f294, f291, f232;
mul.f32 f295, f293, f233;
sub.f32 f296, f294, f295;
mul.f32 f297, f291, f233;
fma.rn.f32 f298, f293, f232, f297;
mul.f32 f299, f240, f291;
mul.f32 f300, f241, f293;
sub.f32 f301, f299, f300;
mul.f32 f302, f240, f293;
fma.rn.f32 f303, f241, f291, f302;
mul.f32 f304, f301, f236;
mul.f32 f305, f303, f237;
sub.f32 f306, f304, f305;
mul.f32 f307, f301, f237;
fma.rn.f32 f308, f303, f236, f307;
mul.f32 f309, f240, f301;
mul.f32 f310, f241, f303;
sub.f32 f311, f309, f310;
mul.f32 f312, f240, f303;
fma.rn.f32 f313, f241, f301, f312;
mul.f32 f314, f311, f210;
mul.f32 f315, f313, f211;
sub.f32 f316, f314, f315;
mul.f32 f317, f311, f211;
fma.rn.f32 f318, f313, f210, f317;
mul.f32 f319, f240, f311;
mul.f32 f320, f241, f313;
sub.f32 f321, f319, f320;
mul.f32 f322, f240, f313;
fma.rn.f32 f323, f241, f311, f322;
mul.f32 f324, f321, f214;
mul.f32 f325, f323, f215;
sub.f32 f326, f324, f325;
mul.f32 f327, f321, f215;
fma.rn.f32 f328, f323, f214, f327;
mul.f32 f329, f240, f321;
mul.f32 f330, f241, f323;
sub.f32 f331, f329, f330;
mul.f32 f332, f240, f323;
fma.rn.f32 f333, f241, f321, f332;
mul.f32 f334, f331, f218;
mul.f32 f335, f333, f219;
sub.f32 f336, f334, f335;
mul.f32 f337, f331, f219;
fma.rn.f32 f338, f333, f218, f337;
mul.f32 f339, f240, f331;
mul.f32 f340, f241, f333;
sub.f32 f341, f339, f340;
mul.f32 f342, f240, f333;
fma.rn.f32 f343, f241, f331, f342;
mul.f32 f344, f341, f222;
mul.f32 f345, f343, f223;
sub.f32 f346, f344, f345;
mul.f32 f347, f341, f223;
fma.rn.f32 f348, f343, f222, f347;
mul.f32 f349, f240, f341;
mul.f32 f350, f241, f343;
sub.f32 f351, f349, f350;
mul.f32 f352, f240, f343;
fma.rn.f32 f353, f241, f341, f352;
mul.f32 f354, f351, f226;
mul.f32 f355, f353, f227;
sub.f32 f356, f354, f355;
mul.f32 f357, f351, f227;
fma.rn.f32 f358, f353, f226, f357;
mul.f32 f359, f240, f351;
mul.f32 f360, f241, f353;
sub.f32 f361, f359, f360;
mul.f32 f362, f240, f353;
fma.rn.f32 f363, f241, f351, f362;
mul.f32 f364, f361, f230;
mul.f32 f365, f363, f231;
sub.f32 f366, f364, f365;
mul.f32 f367, f361, f231;
fma.rn.f32 f368, f363, f230, f367;
mul.f32 f369, f240, f361;
mul.f32 f370, f241, f363;
sub.f32 f371, f369, f370;
mul.f32 f372, f240, f363;
fma.rn.f32 f373, f241, f361, f372;
mul.f32 f374, f371, f234;
mul.f32 f375, f373, f235;
sub.f32 f376, f374, f375;
mul.f32 f377, f371, f235;
fma.rn.f32 f378, f373, f234, f377;
mul.f32 f379, f240, f371;
mul.f32 f380, f241, f373;
sub.f32 f381, f379, f380;
mul.f32 f382, f240, f373;
fma.rn.f32 f383, f241, f371, f382;
mul.f32 f384, f381, f238;
mul.f32 f385, f383, f239;
sub.f32 f386, f384, f385;
mul.f32 f387, f381, f239;
fma.rn.f32 f388, f383, f238, f387;
shl.b32 r8, r5, 6;
and.b32 r9, r8, -512;
add.s32 r10, r4, r9;
barrier.sync 0;
and.b32 r11, r8, 448;
add.s32 r12, r10, r11;
st.shared.v4.f32 [r12], {f208, f246, f256, f266};
st.shared.v4.f32 [r12+16], {f276, f286, f296, f306};
st.shared.v4.f32 [r12+32], {f316, f326, f336, f346};
st.shared.v4.f32 [r12+48], {f356, f366, f376, f386};
barrier.sync 0;
mad.lo.s32 r13, r6, -60, r12;
ld.shared.f32 f389, [r13];
ld.shared.f32 f390, [r13+32];
ld.shared.f32 f391, [r13+64];
ld.shared.f32 f392, [r13+96];
ld.shared.f32 f393, [r13+128];
ld.shared.f32 f394, [r13+160];
ld.shared.f32 f395, [r13+192];
ld.shared.f32 f396, [r13+224];
ld.shared.f32 f397, [r13+256];
ld.shared.f32 f398, [r13+288];
ld.shared.f32 f399, [r13+320];
ld.shared.f32 f400, [r13+352];
ld.shared.f32 f401, [r13+384];
ld.shared.f32 f402, [r13+416];
ld.shared.f32 f403, [r13+448];
ld.shared.f32 f404, [r13+480];
barrier.sync 0;
st.shared.v4.f32 [r12], {f209, f248, f258, f268};
st.shared.v4.f32 [r12+16], {f278, f288, f298, f308};
st.shared.v4.f32 [r12+32], {f318, f328, f338, f348};
st.shared.v4.f32 [r12+48], {f358, f368, f378, f388};
barrier.sync 0;
ld.shared.f32 f405, [r13];
ld.shared.f32 f406, [r13+32];
ld.shared.f32 f407, [r13+64];
ld.shared.f32 f408, [r13+96];
ld.shared.f32 f409, [r13+128];
ld.shared.f32 f410, [r13+160];
ld.shared.f32 f411, [r13+192];
ld.shared.f32 f412, [r13+224];
ld.shared.f32 f413, [r13+256];
ld.shared.f32 f414, [r13+288];
ld.shared.f32 f415, [r13+320];
ld.shared.f32 f416, [r13+352];
ld.shared.f32 f417, [r13+384];
ld.shared.f32 f418, [r13+416];
ld.shared.f32 f419, [r13+448];
ld.shared.f32 f420, [r13+480];
add.f32 f421, f389, f397;
add.f32 f422, f405, f413;
sub.f32 f423, f389, f397;
sub.f32 f424, f405, f413;
add.f32 f425, f393, f401;
add.f32 f426, f409, f417;
sub.f32 f427, f393, f401;
sub.f32 f428, f409, f417;
add.f32 f429, f421, f425;
add.f32 f430, f422, f426;
sub.f32 f431, f421, f425;
sub.f32 f432, f422, f426;
add.f32 f433, f423, f428;
sub.f32 f434, f424, f427;
sub.f32 f435, f423, f428;
add.f32 f436, f424, f427;
add.f32 f437, f391, f399;
add.f32 f438, f407, f415;
sub.f32 f439, f391, f399;
sub.f32 f440, f407, f415;
add.f32 f441, f395, f403;
add.f32 f442, f411, f419;
sub.f32 f443, f395, f403;
sub.f32 f444, f411, f419;
add.f32 f445, f437, f441;
add.f32 f446, f438, f442;
sub.f32 f447, f437, f441;
sub.f32 f448, f438, f442;
add.f32 f449, f439, f444;
sub.f32 f450, f440, f443;
sub.f32 f451, f439, f444;
add.f32 f452, f440, f443;
mul.f32 f453, f449, 0f3F3504F3;
mul.f32 f454, f450, 0fBF3504F3;
sub.f32 f455, f453, f454;
mul.f32 f456, f450, 0f3F3504F3;
fma.rn.f32 f457, f449, 0fBF3504F3, f456;
mul.f32 f458, f451, 0fBF3504F3;
mul.f32 f459, f452, 0fBF3504F3;
sub.f32 f460, f458, f459;
add.f32 f461, f458, f459;
add.f32 f462, f390, f398;
add.f32 f463, f406, f414;
sub.f32 f464, f390, f398;
sub.f32 f465, f406, f414;
add.f32 f466, f394, f402;
add.f32 f467, f410, f418;
sub.f32 f468, f394, f402;
sub.f32 f469, f410, f418;
add.f32 f470, f462, f466;
add.f32 f471, f463, f467;
sub.f32 f472, f462, f466;
sub.f32 f473, f463, f467;
add.f32 f474, f464, f469;
sub.f32 f475, f465, f468;
sub.f32 f476, f464, f469;
add.f32 f477, f465, f468;
add.f32 f478, f392, f400;
add.f32 f479, f408, f416;
sub.f32 f480, f392, f400;
sub.f32 f481, f408, f416;
add.f32 f482, f396, f404;
add.f32 f483, f412, f420;
sub.f32 f484, f396, f404;
sub.f32 f485, f412, f420;
add.f32 f486, f478, f482;
add.f32 f487, f479, f483;
sub.f32 f488, f478, f482;
sub.f32 f489, f479, f483;
add.f32 f490, f480, f485;
sub.f32 f491, f481, f484;
sub.f32 f492, f480, f485;
add.f32 f493, f481, f484;
mul.f32 f494, f490, 0f3F3504F3;
mul.f32 f495, f491, 0fBF3504F3;
sub.f32 f496, f494, f495;
mul.f32 f497, f491, 0f3F3504F3;
fma.rn.f32 f498, f490, 0fBF3504F3, f497;
mul.f32 f499, f492, 0fBF3504F3;
mul.f32 f500, f493, 0fBF3504F3;
sub.f32 f501, f499, f500;
add.f32 f502, f499, f500;
add.f32 %0, f429, f445;
add.f32 %1, f430, f446;
add.f32 %2, f470, f486;
add.f32 %3, f471, f487;
add.f32 %5, f434, f457;
add.f32 %4, f433, f455;
add.f32 %7, f475, f498;
add.f32 %6, f474, f496;
sub.f32 %9, f432, f447;
add.f32 %8, f431, f448;
sub.f32 %11, f473, f488;
add.f32 %10, f472, f489;
add.f32 %13, f436, f461;
add.f32 %12, f435, f460;
add.f32 %15, f477, f502;
add.f32 %14, f476, f501;
sub.f32 %16, f429, f445;
sub.f32 %17, f430, f446;
sub.f32 %18, f470, f486;
sub.f32 %19, f471, f487;
sub.f32 %21, f434, f457;
sub.f32 %20, f433, f455;
sub.f32 %23, f475, f498;
sub.f32 %22, f474, f496;
add.f32 %25, f432, f447;
sub.f32 %24, f431, f448;
add.f32 %27, f473, f488;
sub.f32 %26, f472, f489;
sub.f32 %29, f436, f461;
sub.f32 %28, f435, f460;
sub.f32 %31, f477, f502;
sub.f32 %30, f476, f501;
})"
     // Constraint list. Outputs %0-%31 are rmem[0..15].{x,y}; inputs follow.
     // Several ".y" inputs appear twice on purpose: the PTX above addresses
     // operands by position and skips one copy of each duplicated entry, so
     // removing a duplicate would shift every later %N and break the asm.
     // Outputs are plain "=f" (not early-clobber); the PTX reads every input
     // operand before it writes the first output operand.
     : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y), "=f"(rmem[8].x), "=f"(rmem[8].y), "=f"(rmem[9].x), "=f"(rmem[9].y), "=f"(rmem[10].x), "=f"(rmem[10].y), "=f"(rmem[11].x), "=f"(rmem[11].y), "=f"(rmem[12].x), "=f"(rmem[12].y), "=f"(rmem[13].x), "=f"(rmem[13].y), "=f"(rmem[14].x), "=f"(rmem[14].y), "=f"(rmem[15].x), "=f"(rmem[15].y): "r"(smem), "l"(lut_sp_16_128), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y), "f"(rmem[7].y), "f"(rmem[8].x), "f"(rmem[8].y), "f"(rmem[8].y), "f"(rmem[9].x), "f"(rmem[9].y), "f"(rmem[10].x), "f"(rmem[10].y), "f"(rmem[10].y), "f"(rmem[11].x), "f"(rmem[11].y), "f"(rmem[11].y), "f"(rmem[12].x), "f"(rmem[12].y), "f"(rmem[13].x), "f"(rmem[13].y), "f"(rmem[13].y), "f"(rmem[14].x), "f"(rmem[14].y), "f"(rmem[14].y), "f"(rmem[15].x), "f"(rmem[15].y));
};




// Explicit specialization <52, float, 1> of cufftdx_private_function: a single
// hand-scheduled inline-PTX stage that transforms 8 complex<float> values held
// per thread, with two shared-memory exchanges between threads.
// NOTE(review): the include guard of this header suggests the routine is part of
// a 128-point fp32 forward FFT -- confirm against the generator/dispatcher.
//
// Parameters:
//   rmem - per-thread array; rmem[0..7] are read as inputs and every one of
//          them is overwritten with the result.
//   smem - 32-bit shared-memory address used as the base of the exchange
//          buffer (it is fed directly into st.shared / ld.shared addresses).
//
// Also reads lut_sp_8_128 and lut_sp_8_16 (declared outside this block)
// through ld.global.
//
// What the PTX does, in order:
//   1. Add/sub butterfly network over the 8 inputs using the constant
//      0f3F3504F3 (~0.70710677) -- presumably an 8-point DFT; verify.
//   2. Loads one float2 "w" from lut_sp_8_128 at element index (tid.x & 15),
//      builds w^2 .. w^7 by repeated complex multiplication and multiplies
//      butterfly outputs 1..7 by w^1 .. w^7 (output 0 is left unscaled).
//   3. First exchange: each thread stores its 8 values as interleaved
//      (re, im) pairs, 64 contiguous bytes at  smem + tid.y*1024 + tid.x*64,
//      then after a barrier loads 8 float2 at a 128-byte stride starting
//      56*(tid.x & 15) bytes below its own store address.
//      NOTE(review): the 1024-byte row per tid.y looks like it assumes
//      blockDim.x == 16 -- confirm with the launch configuration.
//   4. A second 8-input butterfly network of the same shape as step 1.
//   5. Loads one float2 "v" from lut_sp_8_16 at byte offset (tid.x & 8), i.e.
//      element 0 or 1, and multiplies outputs 1..7 by v^1 .. v^7.
//   6. Second exchange: stores 8 float2 at a 64-byte stride starting at
//      smem + tid.y*1024 + ((tid.x << 6) & -1024) + (tid.x & 7)*8
//           + ((tid.x << 6) & 512),
//      then after a barrier loads 8 float2 at a 128-byte stride starting
//      56*(tid.x & 8) bytes below that address.
//   7. Final add/sub pass: rmem[k] = a[k] + a[k+4] and rmem[k+4] = a[k] - a[k+4]
//      for k = 0..3, where a[] are the 8 values loaded in step 6.
//
// Synchronization: the asm executes "barrier.sync 0" four times, so the call
// has to be reached by all threads taking part in barrier 0.
template<> __forceinline__ __device__ void cufftdx_private_function<52, float, 1>(cufftdx::detail::complex<float> *rmem, unsigned smem){

// Operand numbering used inside the asm string: %0-%15 are the outputs,
// %16 is smem, %17 and %18 are the two lookup-table addresses, %19-%38 are
// the float inputs.
asm volatile (R"({
.reg .f32 f<365>;
.reg .b32 r<20>;
.reg .b64 rd<9>;
mov.u32 r1, %tid.y;
shl.b32 r2, r1, 10;
mov.u32 r3, %16;
add.s32 r4, r3, r2;
mov.u32 r5, %tid.x;
add.f32 f33, %19, %29;
add.f32 f34, %20, %31;
sub.f32 f35, %19, %29;
sub.f32 f36, %20, %31;
add.f32 f37, %24, %35;
add.f32 f38, %26, %36;
sub.f32 f39, %24, %35;
sub.f32 f40, %26, %36;
add.f32 f41, f33, f37;
add.f32 f42, f34, f38;
sub.f32 f43, f33, f37;
sub.f32 f44, f34, f38;
add.f32 f45, f35, f40;
sub.f32 f46, f36, f39;
sub.f32 f47, f35, f40;
add.f32 f48, f36, f39;
add.f32 f49, %21, %32;
add.f32 f50, %23, %34;
sub.f32 f51, %21, %32;
sub.f32 f52, %23, %34;
add.f32 f53, %27, %37;
add.f32 f54, %28, %38;
sub.f32 f55, %27, %37;
sub.f32 f56, %28, %38;
add.f32 f57, f49, f53;
add.f32 f58, f50, f54;
sub.f32 f59, f49, f53;
sub.f32 f60, f50, f54;
add.f32 f61, f51, f56;
sub.f32 f62, f52, f55;
sub.f32 f63, f51, f56;
add.f32 f64, f52, f55;
mul.f32 f65, f61, 0f3F3504F3;
mul.f32 f66, f62, 0fBF3504F3;
sub.f32 f67, f65, f66;
mul.f32 f68, f62, 0f3F3504F3;
fma.rn.f32 f69, f61, 0fBF3504F3, f68;
mul.f32 f70, f63, 0fBF3504F3;
mul.f32 f71, f64, 0fBF3504F3;
sub.f32 f72, f70, f71;
add.f32 f73, f70, f71;
sub.f32 f74, f41, f57;
sub.f32 f75, f42, f58;
add.f32 f76, f45, f67;
add.f32 f77, f46, f69;
sub.f32 f78, f45, f67;
sub.f32 f79, f46, f69;
add.f32 f80, f43, f60;
sub.f32 f81, f44, f59;
sub.f32 f82, f43, f60;
add.f32 f83, f44, f59;
add.f32 f84, f47, f72;
add.f32 f85, f48, f73;
sub.f32 f86, f47, f72;
sub.f32 f87, f48, f73;
and.b32 r6, r5, 15;
shl.b32 r7, r5, 6;
and.b32 r8, r7, -1024;
add.s32 r9, r4, r8;
shl.b32 r10, r5, 3;
cvt.u64.u32 rd2, r10;
and.b64 rd3, rd2, 120;
mov.u64 rd4, %17;
add.s64 rd5, rd4, rd3;
ld.global.v2.f32 {f88, f89}, [rd5];
mul.f32 f92, f88, f76;
mul.f32 f93, f89, f77;
mul.f32 f94, f88, f77;
mul.f32 f95, f88, f88;
mul.f32 f96, f89, f89;
sub.f32 f97, f95, f96;
mul.f32 f98, f89, f88;
fma.rn.f32 f99, f89, f88, f98;
mul.f32 f100, f97, f80;
mul.f32 f101, f99, f81;
mul.f32 f102, f97, f81;
mul.f32 f103, f88, f97;
mul.f32 f104, f89, f99;
sub.f32 f105, f103, f104;
mul.f32 f106, f88, f99;
fma.rn.f32 f107, f89, f97, f106;
mul.f32 f108, f105, f84;
mul.f32 f109, f107, f85;
mul.f32 f110, f105, f85;
mul.f32 f111, f88, f105;
mul.f32 f112, f89, f107;
sub.f32 f113, f111, f112;
mul.f32 f114, f88, f107;
fma.rn.f32 f115, f89, f105, f114;
mul.f32 f116, f113, f74;
mul.f32 f117, f115, f75;
mul.f32 f118, f113, f75;
mul.f32 f119, f88, f113;
mul.f32 f120, f89, f115;
sub.f32 f121, f119, f120;
mul.f32 f122, f88, f115;
fma.rn.f32 f123, f89, f113, f122;
mul.f32 f124, f121, f78;
mul.f32 f125, f123, f79;
mul.f32 f126, f121, f79;
mul.f32 f127, f88, f121;
mul.f32 f128, f89, f123;
sub.f32 f129, f127, f128;
mul.f32 f130, f88, f123;
fma.rn.f32 f131, f89, f121, f130;
mul.f32 f132, f129, f82;
mul.f32 f133, f131, f83;
mul.f32 f134, f129, f83;
mul.f32 f135, f88, f129;
mul.f32 f136, f89, f131;
sub.f32 f137, f135, f136;
mul.f32 f138, f88, f131;
fma.rn.f32 f139, f89, f129, f138;
mul.f32 f140, f137, f86;
mul.f32 f141, f139, f87;
mul.f32 f142, f137, f87;
barrier.sync 0;
and.b32 r11, r7, 960;
add.s32 r12, r9, r11;
add.f32 f143, f42, f58;
add.f32 f144, f41, f57;
fma.rn.f32 f145, f89, f76, f94;
sub.f32 f146, f92, f93;
st.shared.v4.f32 [r12], {f144, f143, f146, f145};
fma.rn.f32 f147, f99, f80, f102;
sub.f32 f148, f100, f101;
sub.f32 f149, f108, f109;
fma.rn.f32 f150, f107, f84, f110;
st.shared.v4.f32 [r12+16], {f148, f147, f149, f150};
fma.rn.f32 f151, f115, f74, f118;
sub.f32 f152, f116, f117;
fma.rn.f32 f153, f123, f78, f126;
sub.f32 f154, f124, f125;
st.shared.v4.f32 [r12+32], {f152, f151, f154, f153};
fma.rn.f32 f155, f131, f82, f134;
sub.f32 f156, f132, f133;
fma.rn.f32 f157, f139, f86, f142;
sub.f32 f158, f140, f141;
st.shared.v4.f32 [r12+48], {f156, f155, f158, f157};
barrier.sync 0;
mad.lo.s32 r13, r6, -56, r12;
ld.shared.v2.f32 {f159, f160}, [r13];
ld.shared.v2.f32 {f163, f164}, [r13+128];
ld.shared.v2.f32 {f167, f168}, [r13+256];
ld.shared.v2.f32 {f171, f172}, [r13+384];
ld.shared.v2.f32 {f175, f176}, [r13+512];
ld.shared.v2.f32 {f179, f180}, [r13+640];
ld.shared.v2.f32 {f183, f184}, [r13+768];
ld.shared.v2.f32 {f187, f188}, [r13+896];
add.f32 f191, f159, f175;
add.f32 f192, f160, f176;
sub.f32 f193, f159, f175;
sub.f32 f194, f160, f176;
add.f32 f195, f167, f183;
add.f32 f196, f168, f184;
sub.f32 f197, f167, f183;
sub.f32 f198, f168, f184;
add.f32 f199, f191, f195;
add.f32 f200, f192, f196;
sub.f32 f201, f191, f195;
sub.f32 f202, f192, f196;
add.f32 f203, f193, f198;
sub.f32 f204, f194, f197;
sub.f32 f205, f193, f198;
add.f32 f206, f194, f197;
add.f32 f207, f163, f179;
add.f32 f208, f164, f180;
sub.f32 f209, f163, f179;
sub.f32 f210, f164, f180;
add.f32 f211, f171, f187;
add.f32 f212, f172, f188;
sub.f32 f213, f171, f187;
sub.f32 f214, f172, f188;
add.f32 f215, f207, f211;
add.f32 f216, f208, f212;
sub.f32 f217, f207, f211;
sub.f32 f218, f208, f212;
add.f32 f219, f209, f214;
sub.f32 f220, f210, f213;
sub.f32 f221, f209, f214;
add.f32 f222, f210, f213;
mul.f32 f223, f219, 0f3F3504F3;
mul.f32 f224, f220, 0fBF3504F3;
sub.f32 f225, f223, f224;
mul.f32 f226, f220, 0f3F3504F3;
fma.rn.f32 f227, f219, 0fBF3504F3, f226;
mul.f32 f228, f221, 0fBF3504F3;
mul.f32 f229, f222, 0fBF3504F3;
sub.f32 f230, f228, f229;
add.f32 f231, f228, f229;
sub.f32 f232, f199, f215;
sub.f32 f233, f200, f216;
add.f32 f234, f203, f225;
add.f32 f235, f204, f227;
sub.f32 f236, f203, f225;
sub.f32 f237, f204, f227;
add.f32 f238, f201, f218;
sub.f32 f239, f202, f217;
sub.f32 f240, f201, f218;
add.f32 f241, f202, f217;
add.f32 f242, f205, f230;
add.f32 f243, f206, f231;
sub.f32 f244, f205, f230;
sub.f32 f245, f206, f231;
and.b32 r14, r5, 8;
cvt.u64.u32 rd6, r14;
mov.u64 rd7, %18;
add.s64 rd8, rd7, rd6;
ld.global.v2.f32 {f246, f247}, [rd8];
mul.f32 f250, f246, f234;
mul.f32 f251, f247, f235;
mul.f32 f252, f246, f235;
mul.f32 f253, f246, f246;
mul.f32 f254, f247, f247;
sub.f32 f255, f253, f254;
mul.f32 f256, f247, f246;
fma.rn.f32 f257, f247, f246, f256;
mul.f32 f258, f255, f238;
mul.f32 f259, f257, f239;
mul.f32 f260, f255, f239;
mul.f32 f261, f246, f255;
mul.f32 f262, f247, f257;
sub.f32 f263, f261, f262;
mul.f32 f264, f246, f257;
fma.rn.f32 f265, f247, f255, f264;
mul.f32 f266, f263, f242;
mul.f32 f267, f265, f243;
mul.f32 f268, f263, f243;
mul.f32 f269, f246, f263;
mul.f32 f270, f247, f265;
sub.f32 f271, f269, f270;
mul.f32 f272, f246, f265;
fma.rn.f32 f273, f247, f263, f272;
mul.f32 f274, f271, f232;
mul.f32 f275, f273, f233;
mul.f32 f276, f271, f233;
mul.f32 f277, f246, f271;
mul.f32 f278, f247, f273;
sub.f32 f279, f277, f278;
mul.f32 f280, f246, f273;
fma.rn.f32 f281, f247, f271, f280;
mul.f32 f282, f279, f236;
mul.f32 f283, f281, f237;
mul.f32 f284, f279, f237;
mul.f32 f285, f246, f279;
mul.f32 f286, f247, f281;
sub.f32 f287, f285, f286;
mul.f32 f288, f246, f281;
fma.rn.f32 f289, f247, f279, f288;
mul.f32 f290, f287, f240;
mul.f32 f291, f289, f241;
mul.f32 f292, f287, f241;
mul.f32 f293, f246, f287;
mul.f32 f294, f247, f289;
sub.f32 f295, f293, f294;
mul.f32 f296, f246, f289;
fma.rn.f32 f297, f247, f287, f296;
mul.f32 f298, f295, f244;
mul.f32 f299, f297, f245;
mul.f32 f300, f295, f245;
and.b32 r15, r10, 56;
add.s32 r16, r9, r15;
barrier.sync 0;
and.b32 r17, r7, 512;
add.s32 r18, r16, r17;
add.f32 f301, f200, f216;
add.f32 f302, f199, f215;
st.shared.v2.f32 [r18], {f302, f301};
fma.rn.f32 f303, f247, f234, f252;
sub.f32 f304, f250, f251;
st.shared.v2.f32 [r18+64], {f304, f303};
fma.rn.f32 f305, f257, f238, f260;
sub.f32 f306, f258, f259;
st.shared.v2.f32 [r18+128], {f306, f305};
fma.rn.f32 f307, f265, f242, f268;
sub.f32 f308, f266, f267;
st.shared.v2.f32 [r18+192], {f308, f307};
sub.f32 f309, f274, f275;
fma.rn.f32 f310, f273, f232, f276;
st.shared.v2.f32 [r18+256], {f309, f310};
fma.rn.f32 f311, f281, f236, f284;
sub.f32 f312, f282, f283;
st.shared.v2.f32 [r18+320], {f312, f311};
fma.rn.f32 f313, f289, f240, f292;
sub.f32 f314, f290, f291;
st.shared.v2.f32 [r18+384], {f314, f313};
fma.rn.f32 f315, f297, f244, f300;
sub.f32 f316, f298, f299;
st.shared.v2.f32 [r18+448], {f316, f315};
barrier.sync 0;
mad.lo.s32 r19, r14, -56, r18;
ld.shared.v2.f32 {f317, f318}, [r19];
ld.shared.v2.f32 {f321, f322}, [r19+128];
ld.shared.v2.f32 {f325, f326}, [r19+256];
ld.shared.v2.f32 {f329, f330}, [r19+384];
ld.shared.v2.f32 {f333, f334}, [r19+512];
ld.shared.v2.f32 {f337, f338}, [r19+640];
ld.shared.v2.f32 {f341, f342}, [r19+768];
ld.shared.v2.f32 {f345, f346}, [r19+896];
add.f32 %1, f318, f334;
add.f32 %0, f317, f333;
add.f32 %3, f322, f338;
add.f32 %2, f321, f337;
add.f32 %5, f326, f342;
add.f32 %4, f325, f341;
add.f32 %7, f330, f346;
add.f32 %6, f329, f345;
sub.f32 %9, f318, f334;
sub.f32 %8, f317, f333;
sub.f32 %11, f322, f338;
sub.f32 %10, f321, f337;
sub.f32 %13, f326, f342;
sub.f32 %12, f325, f341;
sub.f32 %15, f330, f346;
sub.f32 %14, f329, f345;
})"
     // Constraint list. Outputs %0-%15 are rmem[0..7].{x,y}; inputs follow.
     // Several ".y" inputs appear twice on purpose: the PTX above addresses
     // operands by position and skips one copy of each duplicated entry, so
     // removing a duplicate would shift every later %N and break the asm.
     // Outputs are plain "=f" (not early-clobber); the PTX reads every input
     // operand before it writes the first output operand.
     : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y): "r"(smem), "l"(lut_sp_8_128), "l"(lut_sp_8_16), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y));
};




template<> __forceinline__ __device__ void cufftdx_private_function<53, float, 1>(cufftdx::detail::complex<float> *rmem, unsigned smem){

asm volatile (R"({
.reg .f32 f<692>;
.reg .b32 r<22>;
.reg .b64 rd<9>;
mov.u32 r1, %tid.y;
shl.b32 r2, r1, 10;
mov.u32 r3, %32;
add.s32 r4, r3, r2;
add.f32 f65, %34, %50;
sub.f32 f67, %34, %50;
add.f32 f684, %35, %66;
sub.f32 f68, %35, %66;
add.f32 f69, %42, %58;
sub.f32 f71, %42, %58;
add.f32 f682, %67, %59;
sub.f32 f72, %67, %59;
add.f32 f73, f65, f69;
sub.f32 f75, f65, f69;
add.f32 f681, f684, f682;
sub.f32 f76, f684, f682;
add.f32 f77, f67, f72;
sub.f32 f79, f67, f72;
sub.f32 f680, f68, f71;
add.f32 f80, f68, f71;
add.f32 f81, %38, %54;
sub.f32 f83, %38, %54;
add.f32 f677, %69, %68;
sub.f32 f84, %69, %68;
add.f32 f85, %46, %62;
sub.f32 f87, %46, %62;
add.f32 f675, %47, %70;
sub.f32 f88, %47, %70;
add.f32 f89, f81, f85;
sub.f32 f91, f81, f85;
add.f32 f674, f677, f675;
sub.f32 f92, f677, f675;
add.f32 f93, f83, f88;
sub.f32 f95, f83, f88;
sub.f32 f673, f84, f87;
add.f32 f96, f84, f87;
mul.f32 f98, f673, 0fBF3504F3;
mul.f32 f672, f93, 0f3F3504F3;
sub.f32 f99, f672, f98;
mul.f32 f100, f673, 0f3F3504F3;
fma.rn.f32 f101, f93, 0fBF3504F3, f100;
mul.f32 f102, f95, 0fBF3504F3;
mul.f32 f103, f96, 0fBF3504F3;
sub.f32 f104, f102, f103;
add.f32 f105, f102, f103;
add.f32 f106, f73, f89;
sub.f32 f108, f73, f89;
add.f32 f671, f681, f674;
sub.f32 f109, f681, f674;
add.f32 f110, f77, f99;
sub.f32 f112, f77, f99;
add.f32 f670, f680, f101;
sub.f32 f113, f680, f101;
add.f32 f114, f75, f92;
sub.f32 f116, f75, f92;
sub.f32 f669, f76, f91;
add.f32 f117, f76, f91;
add.f32 f118, f79, f104;
sub.f32 f120, f79, f104;
add.f32 f668, f80, f105;
sub.f32 f121, f80, f105;
add.f32 f122, %36, %52;
sub.f32 f124, %36, %52;
add.f32 f666, %71, %53;
sub.f32 f125, %71, %53;
add.f32 f126, %44, %60;
sub.f32 f128, %44, %60;
add.f32 f663, %72, %73;
sub.f32 f129, %72, %73;
add.f32 f130, f122, f126;
sub.f32 f132, f122, f126;
add.f32 f662, f666, f663;
sub.f32 f133, f666, f663;
add.f32 f134, f124, f129;
sub.f32 f136, f124, f129;
sub.f32 f661, f125, f128;
add.f32 f137, f125, f128;
add.f32 f138, %40, %56;
sub.f32 f140, %40, %56;
add.f32 f659, %41, %74;
sub.f32 f141, %41, %74;
add.f32 f142, %48, %64;
sub.f32 f144, %48, %64;
add.f32 f657, %75, %65;
sub.f32 f145, %75, %65;
add.f32 f146, f138, f142;
sub.f32 f148, f138, f142;
add.f32 f656, f659, f657;
sub.f32 f149, f659, f657;
add.f32 f150, f140, f145;
sub.f32 f152, f140, f145;
sub.f32 f655, f141, f144;
add.f32 f153, f141, f144;
mul.f32 f155, f655, 0fBF3504F3;
mul.f32 f654, f150, 0f3F3504F3;
sub.f32 f156, f654, f155;
mul.f32 f157, f655, 0f3F3504F3;
fma.rn.f32 f158, f150, 0fBF3504F3, f157;
mul.f32 f159, f152, 0fBF3504F3;
mul.f32 f160, f153, 0fBF3504F3;
sub.f32 f161, f159, f160;
add.f32 f162, f159, f160;
add.f32 f163, f130, f146;
sub.f32 f165, f130, f146;
add.f32 f653, f662, f656;
sub.f32 f166, f662, f656;
add.f32 f167, f134, f156;
sub.f32 f169, f134, f156;
add.f32 f652, f661, f158;
sub.f32 f170, f661, f158;
add.f32 f171, f132, f149;
sub.f32 f173, f132, f149;
sub.f32 f651, f133, f148;
add.f32 f174, f133, f148;
add.f32 f175, f136, f161;
sub.f32 f177, f136, f161;
add.f32 f650, f137, f162;
sub.f32 f178, f137, f162;
mul.f32 f648, f167, 0f3F6C835E;
mul.f32 f649, f652, 0fBEC3EF15;
sub.f32 f181, f648, f649;
mul.f32 f182, f652, 0f3F6C835E;
fma.rn.f32 f183, f167, 0fBEC3EF15, f182;
mul.f32 f646, f171, 0f3F3504F3;
mul.f32 f647, f651, 0fBF3504F3;
sub.f32 f186, f646, f647;
mul.f32 f187, f651, 0f3F3504F3;
fma.rn.f32 f188, f171, 0fBF3504F3, f187;
mul.f32 f644, f175, 0f3EC3EF15;
mul.f32 f645, f650, 0fBF6C835E;
sub.f32 f191, f644, f645;
mul.f32 f192, f650, 0f3EC3EF15;
fma.rn.f32 f193, f175, 0fBF6C835E, f192;
mul.f32 f642, f169, 0fBEC3EF15;
mul.f32 f643, f170, 0fBF6C835E;
sub.f32 f196, f642, f643;
mul.f32 f197, f170, 0fBEC3EF15;
fma.rn.f32 f198, f169, 0fBF6C835E, f197;
mul.f32 f199, f173, 0fBF3504F3;
mul.f32 f200, f174, 0fBF3504F3;
sub.f32 f201, f199, f200;
add.f32 f202, f199, f200;
mul.f32 f640, f177, 0fBF6C835E;
mul.f32 f641, f178, 0fBEC3EF15;
sub.f32 f205, f640, f641;
mul.f32 f206, f178, 0fBF6C835E;
fma.rn.f32 f207, f177, 0fBEC3EF15, f206;
add.f32 f210, f110, f181;
sub.f32 f212, f110, f181;
add.f32 f639, f670, f183;
sub.f32 f213, f670, f183;
add.f32 f214, f114, f186;
sub.f32 f216, f114, f186;
add.f32 f638, f669, f188;
sub.f32 f217, f669, f188;
add.f32 f218, f118, f191;
sub.f32 f220, f118, f191;
add.f32 f637, f668, f193;
sub.f32 f221, f668, f193;
add.f32 f222, f108, f166;
sub.f32 f224, f108, f166;
sub.f32 f636, f109, f165;
add.f32 f225, f109, f165;
add.f32 f226, f112, f196;
sub.f32 f228, f112, f196;
add.f32 f635, f113, f198;
sub.f32 f229, f113, f198;
add.f32 f230, f116, f201;
sub.f32 f232, f116, f201;
add.f32 f634, f117, f202;
sub.f32 f233, f117, f202;
add.f32 f234, f120, f205;
sub.f32 f236, f120, f205;
add.f32 f633, f121, f207;
sub.f32 f237, f121, f207;
mov.u32 r15, %tid.x;
shl.b32 r7, r15, 7;
and.b32 r8, r7, -1024;
add.s32 r9, r4, r8;
and.b32 r14, r15, 7;
shl.b32 r10, r15, 3;
cvt.u64.u32 rd2, r10;
and.b64 rd3, rd2, 56;
mov.u64 rd4, %33;
add.s64 rd5, rd4, rd3;
ld.global.v2.f32 {f238, f239}, [rd5];
mul.f32 f243, f239, f639;
mul.f32 f244, f238, f639;
mul.f32 f246, f239, f239;
mul.f32 f632, f238, f238;
sub.f32 f247, f632, f246;
mul.f32 f248, f239, f238;
fma.rn.f32 f249, f239, f238, f248;
mul.f32 f251, f249, f638;
mul.f32 f252, f247, f638;
mul.f32 f630, f238, f247;
mul.f32 f631, f239, f249;
sub.f32 f255, f630, f631;
mul.f32 f629, f247, f214;
mul.f32 f256, f238, f249;
fma.rn.f32 f257, f239, f247, f256;
mul.f32 f259, f257, f637;
mul.f32 f260, f255, f637;
mul.f32 f262, f239, f257;
mul.f32 f628, f238, f255;
sub.f32 f263, f628, f262;
mul.f32 f627, f255, f218;
mul.f32 f264, f238, f257;
fma.rn.f32 f265, f239, f255, f264;
mul.f32 f267, f265, f636;
mul.f32 f268, f263, f636;
mul.f32 f270, f239, f265;
mul.f32 f626, f238, f263;
sub.f32 f271, f626, f270;
mul.f32 f625, f263, f222;
mul.f32 f272, f238, f265;
fma.rn.f32 f273, f239, f263, f272;
mul.f32 f275, f273, f635;
mul.f32 f276, f271, f635;
mul.f32 f623, f238, f271;
mul.f32 f624, f239, f273;
sub.f32 f279, f623, f624;
mul.f32 f622, f271, f226;
mul.f32 f280, f238, f273;
fma.rn.f32 f281, f239, f271, f280;
mul.f32 f283, f281, f634;
mul.f32 f284, f279, f634;
mul.f32 f286, f239, f281;
mul.f32 f621, f238, f279;
sub.f32 f287, f621, f286;
mul.f32 f620, f279, f230;
mul.f32 f288, f238, f281;
fma.rn.f32 f289, f239, f279, f288;
mul.f32 f291, f289, f633;
mul.f32 f292, f287, f633;
mul.f32 f294, f239, f289;
mul.f32 f619, f238, f287;
sub.f32 f295, f619, f294;
mul.f32 f618, f287, f234;
mul.f32 f296, f238, f289;
fma.rn.f32 f297, f239, f287, f296;
sub.f32 f617, f671, f653;
mul.f32 f299, f297, f617;
mul.f32 f300, f295, f617;
mul.f32 f615, f238, f295;
mul.f32 f616, f239, f297;
sub.f32 f303, f615, f616;
sub.f32 f614, f106, f163;
mul.f32 f613, f295, f614;
mul.f32 f304, f238, f297;
fma.rn.f32 f305, f239, f295, f304;
mul.f32 f307, f305, f213;
mul.f32 f308, f303, f213;
mul.f32 f310, f239, f305;
mul.f32 f612, f238, f303;
sub.f32 f311, f612, f310;
mul.f32 f611, f303, f212;
mul.f32 f312, f238, f305;
fma.rn.f32 f313, f239, f303, f312;
mul.f32 f315, f313, f217;
mul.f32 f316, f311, f217;
mul.f32 f609, f238, f311;
mul.f32 f610, f239, f313;
sub.f32 f319, f609, f610;
mul.f32 f608, f311, f216;
mul.f32 f320, f238, f313;
fma.rn.f32 f321, f239, f311, f320;
mul.f32 f323, f321, f221;
mul.f32 f324, f319, f221;
mul.f32 f326, f239, f321;
mul.f32 f607, f238, f319;
sub.f32 f327, f607, f326;
mul.f32 f606, f319, f220;
mul.f32 f328, f238, f321;
fma.rn.f32 f329, f239, f319, f328;
mul.f32 f331, f329, f225;
mul.f32 f332, f327, f225;
mul.f32 f334, f239, f329;
mul.f32 f605, f238, f327;
sub.f32 f335, f605, f334;
mul.f32 f604, f327, f224;
mul.f32 f336, f238, f329;
fma.rn.f32 f337, f239, f327, f336;
mul.f32 f339, f337, f229;
mul.f32 f340, f335, f229;
mul.f32 f602, f238, f335;
mul.f32 f603, f239, f337;
sub.f32 f343, f602, f603;
mul.f32 f601, f335, f228;
mul.f32 f344, f238, f337;
fma.rn.f32 f345, f239, f335, f344;
mul.f32 f347, f345, f233;
mul.f32 f348, f343, f233;
mul.f32 f350, f239, f345;
mul.f32 f600, f238, f343;
sub.f32 f351, f600, f350;
mul.f32 f599, f238, f210;
mul.f32 f352, f238, f345;
mul.f32 f598, f343, f232;
fma.rn.f32 f353, f239, f343, f352;
mul.f32 f354, f351, f236;
mul.f32 f355, f353, f237;
mul.f32 f356, f351, f237;
sub.f32 f689, f671, f653;
mul.f32 f688, f297, f689;
mov.u32 r21, %tid.x;
shl.b32 r20, r21, 7;
barrier.sync 0;
and.b32 r11, r20, 896;
add.s32 r12, r9, r11;
sub.f32 f691, f671, f653;
mul.f32 f690, f297, f691;
add.f32 f357, f671, f653;
sub.f32 f687, f106, f163;
add.f32 f358, f106, f163;
mov.u32 r17, %tid.x;
and.b32 r16, r17, 7;
mov.u32 r19, %tid.x;
and.b32 r18, r19, 7;
fma.rn.f32 f359, f239, f210, f244;
sub.f32 f360, f599, f243;
st.shared.v4.f32 [r12], {f358, f357, f360, f359};
fma.rn.f32 f361, f249, f214, f252;
sub.f32 f362, f629, f251;
fma.rn.f32 f363, f257, f218, f260;
sub.f32 f364, f627, f259;
st.shared.v4.f32 [r12+16], {f362, f361, f364, f363};
sub.f32 f365, f625, f267;
fma.rn.f32 f366, f265, f222, f268;
fma.rn.f32 f367, f273, f226, f276;
sub.f32 f368, f622, f275;
st.shared.v4.f32 [r12+32], {f365, f366, f368, f367};
fma.rn.f32 f369, f281, f230, f284;
sub.f32 f370, f620, f283;
fma.rn.f32 f371, f289, f234, f292;
sub.f32 f372, f618, f291;
st.shared.v4.f32 [r12+48], {f370, f369, f372, f371};
fma.rn.f32 f373, f297, f687, f300;
sub.f32 f374, f613, f690;
fma.rn.f32 f375, f305, f212, f308;
sub.f32 f376, f611, f307;
st.shared.v4.f32 [r12+64], {f374, f373, f376, f375};
fma.rn.f32 f377, f313, f216, f316;
sub.f32 f378, f608, f315;
fma.rn.f32 f379, f321, f220, f324;
sub.f32 f380, f606, f323;
st.shared.v4.f32 [r12+80], {f378, f377, f380, f379};
fma.rn.f32 f381, f329, f224, f332;
sub.f32 f382, f604, f331;
fma.rn.f32 f383, f337, f228, f340;
sub.f32 f384, f601, f339;
st.shared.v4.f32 [r12+96], {f382, f381, f384, f383};
fma.rn.f32 f385, f345, f232, f348;
sub.f32 f386, f598, f347;
fma.rn.f32 f387, f353, f236, f356;
sub.f32 f388, f354, f355;
st.shared.v4.f32 [r12+112], {f386, f385, f388, f387};
barrier.sync 0;
mad.lo.s32 r13, r18, -120, r12;
ld.shared.v2.f32 {f389, f390}, [r13];
ld.shared.v2.f32 {f393, f394}, [r13+64];
ld.shared.v2.f32 {f397, f398}, [r13+128];
ld.shared.v2.f32 {f401, f402}, [r13+192];
ld.shared.v2.f32 {f405, f406}, [r13+256];
ld.shared.v2.f32 {f409, f410}, [r13+320];
ld.shared.v2.f32 {f413, f414}, [r13+384];
ld.shared.v2.f32 {f417, f418}, [r13+448];
ld.shared.v2.f32 {f421, f422}, [r13+512];
ld.shared.v2.f32 {f425, f426}, [r13+576];
ld.shared.v2.f32 {f429, f430}, [r13+640];
ld.shared.v2.f32 {f433, f434}, [r13+704];
ld.shared.v2.f32 {f437, f438}, [r13+768];
ld.shared.v2.f32 {f441, f442}, [r13+832];
ld.shared.v2.f32 {f445, f446}, [r13+896];
ld.shared.v2.f32 {f449, f450}, [r13+960];
add.f32 f453, f389, f421;
sub.f32 f455, f389, f421;
add.f32 f597, f390, f422;
sub.f32 f456, f390, f422;
add.f32 f457, f405, f437;
sub.f32 f459, f405, f437;
add.f32 f596, f406, f438;
sub.f32 f460, f406, f438;
add.f32 f461, f453, f457;
sub.f32 f463, f453, f457;
add.f32 f595, f597, f596;
sub.f32 f464, f597, f596;
add.f32 f465, f455, f460;
sub.f32 f467, f455, f460;
sub.f32 f594, f456, f459;
add.f32 f468, f456, f459;
add.f32 f469, f397, f429;
sub.f32 f471, f397, f429;
add.f32 f593, f398, f430;
sub.f32 f472, f398, f430;
add.f32 f473, f413, f445;
sub.f32 f475, f413, f445;
add.f32 f592, f414, f446;
sub.f32 f476, f414, f446;
add.f32 f477, f469, f473;
sub.f32 f479, f469, f473;
add.f32 f591, f593, f592;
sub.f32 f480, f593, f592;
add.f32 f481, f471, f476;
sub.f32 f483, f471, f476;
sub.f32 f590, f472, f475;
add.f32 f484, f472, f475;
mul.f32 f588, f481, 0f3F3504F3;
mul.f32 f589, f590, 0fBF3504F3;
sub.f32 f487, f588, f589;
mul.f32 f488, f590, 0f3F3504F3;
fma.rn.f32 f489, f481, 0fBF3504F3, f488;
mul.f32 f490, f483, 0fBF3504F3;
mul.f32 f491, f484, 0fBF3504F3;
sub.f32 f492, f490, f491;
add.f32 f493, f490, f491;
add.f32 f494, f393, f425;
sub.f32 f496, f393, f425;
add.f32 f587, f394, f426;
sub.f32 f497, f394, f426;
add.f32 f498, f409, f441;
sub.f32 f500, f409, f441;
add.f32 f586, f410, f442;
sub.f32 f501, f410, f442;
add.f32 f502, f494, f498;
sub.f32 f504, f494, f498;
add.f32 f585, f587, f586;
sub.f32 f505, f587, f586;
add.f32 f506, f496, f501;
sub.f32 f508, f496, f501;
sub.f32 f584, f497, f500;
add.f32 f509, f497, f500;
add.f32 f510, f401, f433;
sub.f32 f512, f401, f433;
add.f32 f583, f402, f434;
sub.f32 f513, f402, f434;
add.f32 f514, f417, f449;
sub.f32 f516, f417, f449;
add.f32 f582, f418, f450;
sub.f32 f517, f418, f450;
add.f32 f518, f510, f514;
sub.f32 f520, f510, f514;
add.f32 f581, f583, f582;
sub.f32 f521, f583, f582;
add.f32 f522, f512, f517;
sub.f32 f524, f512, f517;
sub.f32 f580, f513, f516;
add.f32 f525, f513, f516;
mul.f32 f527, f580, 0fBF3504F3;
mul.f32 f579, f522, 0f3F3504F3;
sub.f32 f528, f579, f527;
mul.f32 f529, f580, 0f3F3504F3;
fma.rn.f32 f530, f522, 0fBF3504F3, f529;
mul.f32 f531, f524, 0fBF3504F3;
mul.f32 f532, f525, 0fBF3504F3;
sub.f32 f533, f531, f532;
add.f32 f534, f531, f532;
add.f32 %1, f595, f591;
add.f32 %0, f461, f477;
add.f32 %3, f585, f581;
add.f32 %2, f502, f518;
add.f32 %4, f465, f487;
add.f32 %5, f594, f489;
add.f32 %6, f506, f528;
add.f32 %7, f584, f530;
add.f32 %8, f463, f480;
sub.f32 %9, f464, f479;
add.f32 %10, f504, f521;
sub.f32 %11, f505, f520;
add.f32 %13, f468, f493;
add.f32 %12, f467, f492;
add.f32 %15, f509, f534;
add.f32 %14, f508, f533;
sub.f32 %17, f595, f591;
sub.f32 %16, f461, f477;
sub.f32 %19, f585, f581;
sub.f32 %18, f502, f518;
sub.f32 %21, f594, f489;
sub.f32 %20, f465, f487;
sub.f32 %23, f584, f530;
sub.f32 %22, f506, f528;
add.f32 %25, f464, f479;
sub.f32 %24, f463, f480;
add.f32 %27, f505, f520;
sub.f32 %26, f504, f521;
sub.f32 %29, f468, f493;
sub.f32 %28, f467, f492;
sub.f32 %31, f509, f534;
sub.f32 %30, f508, f533;
})"
     : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y), "=f"(rmem[8].x), "=f"(rmem[8].y), "=f"(rmem[9].x), "=f"(rmem[9].y), "=f"(rmem[10].x), "=f"(rmem[10].y), "=f"(rmem[11].x), "=f"(rmem[11].y), "=f"(rmem[12].x), "=f"(rmem[12].y), "=f"(rmem[13].x), "=f"(rmem[13].y), "=f"(rmem[14].x), "=f"(rmem[14].y), "=f"(rmem[15].x), "=f"(rmem[15].y): "r"(smem), "l"(lut_sp_16_128), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y), "f"(rmem[8].x), "f"(rmem[8].y), "f"(rmem[9].x), "f"(rmem[9].y), "f"(rmem[10].x), "f"(rmem[10].y), "f"(rmem[11].x), "f"(rmem[11].y), "f"(rmem[12].x), "f"(rmem[12].y), "f"(rmem[13].x), "f"(rmem[13].y), "f"(rmem[14].x), "f"(rmem[14].y), "f"(rmem[15].x), "f"(rmem[15].y), "f"(rmem[8].y), "f"(rmem[4].y), "f"(rmem[10].y), "f"(rmem[2].y), "f"(rmem[14].y), "f"(rmem[1].y), "f"(rmem[5].y), "f"(rmem[13].y), "f"(rmem[11].y), "f"(rmem[7].y));
};




// cufftdx_private_function<54, float, 1>
//
// Device-only, force-inlined specialization whose whole body is one inline-PTX
// block. It transforms rmem[0..7] in place: every rmem[k].x / rmem[k].y is bound
// both as an input and as an output operand of the asm statement.
//
// Parameters:
//   rmem - per-thread array of complex<float>; elements 0..7 are read and overwritten.
//   smem - 32-bit value used as the base address of the st.shared / ld.shared
//          accesses. The PTX adds (tid.y << 9) and ((tid.x << 5) & -512) to it
//          before adding the per-thread offsets.
//
// Sequence performed by the PTX text:
//   1. Add/sub butterflies over the eight complex inputs. 0f3F3504F3 / 0fBF3504F3
//      are the float bit patterns of +0.70710678 / -0.70710678.
//   2. One (f90, f91) pair is loaded from lut_sp_8_128 at byte offset
//      (tid.x << 3) & 120. Its complex powers 2..7 are built by repeated complex
//      multiplication, and powers 1..7 are multiplied into seven of the eight
//      butterfly results; the eighth (f74, f75) is stored unscaled.
//   3. The results are exchanged between threads through shared memory in two
//      passes over the same addresses (first the values that end up in the .x
//      outputs, then the ones that end up in the .y outputs). Each store group
//      and each load group is separated by barrier.sync 0.
//   4. Steps 1-3 are repeated on the exchanged values, this time with a pair
//      loaded from lut_sp_8_16 at byte offset (tid.x & 8) and scalar stores
//      spaced 32 bytes apart.
//   5. A final add/sub of the loaded values produces the outputs: the sums go to
//      rmem[0..3], the differences to rmem[4..7].
//
// NOTE(review): barrier.sync 0 is executed eight times unconditionally, so
// presumably every thread taking part in barrier 0 must call this function
// together - confirm against the caller.
// NOTE(review): the include guard (CUFFTDX_FFT_128_FP32_FWD_PTX_HPP) suggests this
// is one variant of a 128-point forward FP32 FFT and that the lut_sp_* tables
// hold twiddle factors; neither table is defined in the visible source - confirm.
template<> __forceinline__ __device__ void cufftdx_private_function<54, float, 1>(cufftdx::detail::complex<float> *rmem, unsigned smem){

// The PTX below is kept verbatim: operand numbers (%N), register numbering and
// barrier placement all depend on the exact text.
asm volatile (R"({
.reg .f32 f<333>;
.reg .b32 r<21>;
.reg .b64 rd<9>;
mov.u32 r1, %tid.y;
shl.b32 r2, r1, 9;
mov.u32 r3, %16;
add.s32 r4, r3, r2;
mov.u32 r5, %tid.x;
add.f32 f33, %19, %29;
add.f32 f34, %20, %31;
sub.f32 f35, %19, %29;
sub.f32 f36, %20, %31;
add.f32 f37, %24, %35;
add.f32 f38, %26, %36;
sub.f32 f39, %24, %35;
sub.f32 f40, %26, %36;
add.f32 f41, f33, f37;
add.f32 f42, f34, f38;
sub.f32 f43, f33, f37;
sub.f32 f44, f34, f38;
add.f32 f45, f35, f40;
sub.f32 f46, f36, f39;
sub.f32 f47, f35, f40;
add.f32 f48, f36, f39;
add.f32 f49, %21, %32;
add.f32 f50, %23, %34;
sub.f32 f51, %21, %32;
sub.f32 f52, %23, %34;
add.f32 f53, %27, %37;
add.f32 f54, %28, %38;
sub.f32 f55, %27, %37;
sub.f32 f56, %28, %38;
add.f32 f57, f49, f53;
add.f32 f58, f50, f54;
sub.f32 f59, f49, f53;
sub.f32 f60, f50, f54;
add.f32 f61, f51, f56;
sub.f32 f62, f52, f55;
sub.f32 f63, f51, f56;
add.f32 f64, f52, f55;
mul.f32 f65, f61, 0f3F3504F3;
mul.f32 f66, f62, 0fBF3504F3;
sub.f32 f67, f65, f66;
mul.f32 f68, f62, 0f3F3504F3;
fma.rn.f32 f69, f61, 0fBF3504F3, f68;
mul.f32 f70, f63, 0fBF3504F3;
mul.f32 f71, f64, 0fBF3504F3;
sub.f32 f72, f70, f71;
add.f32 f73, f70, f71;
add.f32 f74, f41, f57;
add.f32 f75, f42, f58;
sub.f32 f76, f41, f57;
sub.f32 f77, f42, f58;
add.f32 f78, f45, f67;
add.f32 f79, f46, f69;
sub.f32 f80, f45, f67;
sub.f32 f81, f46, f69;
add.f32 f82, f43, f60;
sub.f32 f83, f44, f59;
sub.f32 f84, f43, f60;
add.f32 f85, f44, f59;
add.f32 f86, f47, f72;
add.f32 f87, f48, f73;
sub.f32 f88, f47, f72;
sub.f32 f89, f48, f73;
and.b32 r6, r5, 15;
shl.b32 r7, r5, 3;
cvt.u64.u32 rd2, r7;
and.b64 rd3, rd2, 120;
mov.u64 rd4, %17;
add.s64 rd5, rd4, rd3;
ld.global.v2.f32 {f90, f91}, [rd5];
mul.f32 f94, f90, f78;
mul.f32 f95, f91, f79;
sub.f32 f96, f94, f95;
mul.f32 f97, f90, f79;
fma.rn.f32 f98, f91, f78, f97;
mul.f32 f99, f90, f90;
mul.f32 f100, f91, f91;
sub.f32 f101, f99, f100;
mul.f32 f102, f91, f90;
fma.rn.f32 f103, f91, f90, f102;
mul.f32 f104, f101, f82;
mul.f32 f105, f103, f83;
sub.f32 f106, f104, f105;
mul.f32 f107, f101, f83;
fma.rn.f32 f108, f103, f82, f107;
mul.f32 f109, f90, f101;
mul.f32 f110, f91, f103;
sub.f32 f111, f109, f110;
mul.f32 f112, f90, f103;
fma.rn.f32 f113, f91, f101, f112;
mul.f32 f114, f111, f86;
mul.f32 f115, f113, f87;
sub.f32 f116, f114, f115;
mul.f32 f117, f111, f87;
fma.rn.f32 f118, f113, f86, f117;
mul.f32 f119, f90, f111;
mul.f32 f120, f91, f113;
sub.f32 f121, f119, f120;
mul.f32 f122, f90, f113;
fma.rn.f32 f123, f91, f111, f122;
mul.f32 f124, f121, f76;
mul.f32 f125, f123, f77;
sub.f32 f126, f124, f125;
mul.f32 f127, f121, f77;
fma.rn.f32 f128, f123, f76, f127;
mul.f32 f129, f90, f121;
mul.f32 f130, f91, f123;
sub.f32 f131, f129, f130;
mul.f32 f132, f90, f123;
fma.rn.f32 f133, f91, f121, f132;
mul.f32 f134, f131, f80;
mul.f32 f135, f133, f81;
sub.f32 f136, f134, f135;
mul.f32 f137, f131, f81;
fma.rn.f32 f138, f133, f80, f137;
mul.f32 f139, f90, f131;
mul.f32 f140, f91, f133;
sub.f32 f141, f139, f140;
mul.f32 f142, f90, f133;
fma.rn.f32 f143, f91, f131, f142;
mul.f32 f144, f141, f84;
mul.f32 f145, f143, f85;
sub.f32 f146, f144, f145;
mul.f32 f147, f141, f85;
fma.rn.f32 f148, f143, f84, f147;
mul.f32 f149, f90, f141;
mul.f32 f150, f91, f143;
sub.f32 f151, f149, f150;
mul.f32 f152, f90, f143;
fma.rn.f32 f153, f91, f141, f152;
mul.f32 f154, f151, f88;
mul.f32 f155, f153, f89;
sub.f32 f156, f154, f155;
mul.f32 f157, f151, f89;
fma.rn.f32 f158, f153, f88, f157;
shl.b32 r8, r5, 5;
and.b32 r9, r8, -512;
add.s32 r10, r4, r9;
barrier.sync 0;
and.b32 r11, r8, 480;
add.s32 r12, r10, r11;
st.shared.v4.f32 [r12], {f74, f96, f106, f116};
st.shared.v4.f32 [r12+16], {f126, f136, f146, f156};
barrier.sync 0;
mad.lo.s32 r13, r6, -28, r12;
ld.shared.f32 f159, [r13];
ld.shared.f32 f160, [r13+64];
ld.shared.f32 f161, [r13+128];
ld.shared.f32 f162, [r13+192];
ld.shared.f32 f163, [r13+256];
ld.shared.f32 f164, [r13+320];
ld.shared.f32 f165, [r13+384];
ld.shared.f32 f166, [r13+448];
barrier.sync 0;
st.shared.v4.f32 [r12], {f75, f98, f108, f118};
st.shared.v4.f32 [r12+16], {f128, f138, f148, f158};
barrier.sync 0;
ld.shared.f32 f167, [r13];
ld.shared.f32 f168, [r13+64];
ld.shared.f32 f169, [r13+128];
ld.shared.f32 f170, [r13+192];
ld.shared.f32 f171, [r13+256];
ld.shared.f32 f172, [r13+320];
ld.shared.f32 f173, [r13+384];
ld.shared.f32 f174, [r13+448];
add.f32 f175, f159, f163;
add.f32 f176, f167, f171;
sub.f32 f177, f159, f163;
sub.f32 f178, f167, f171;
add.f32 f179, f161, f165;
add.f32 f180, f169, f173;
sub.f32 f181, f161, f165;
sub.f32 f182, f169, f173;
add.f32 f183, f175, f179;
add.f32 f184, f176, f180;
sub.f32 f185, f175, f179;
sub.f32 f186, f176, f180;
add.f32 f187, f177, f182;
sub.f32 f188, f178, f181;
sub.f32 f189, f177, f182;
add.f32 f190, f178, f181;
add.f32 f191, f160, f164;
add.f32 f192, f168, f172;
sub.f32 f193, f160, f164;
sub.f32 f194, f168, f172;
add.f32 f195, f162, f166;
add.f32 f196, f170, f174;
sub.f32 f197, f162, f166;
sub.f32 f198, f170, f174;
add.f32 f199, f191, f195;
add.f32 f200, f192, f196;
sub.f32 f201, f191, f195;
sub.f32 f202, f192, f196;
add.f32 f203, f193, f198;
sub.f32 f204, f194, f197;
sub.f32 f205, f193, f198;
add.f32 f206, f194, f197;
mul.f32 f207, f203, 0f3F3504F3;
mul.f32 f208, f204, 0fBF3504F3;
sub.f32 f209, f207, f208;
mul.f32 f210, f204, 0f3F3504F3;
fma.rn.f32 f211, f203, 0fBF3504F3, f210;
mul.f32 f212, f205, 0fBF3504F3;
mul.f32 f213, f206, 0fBF3504F3;
sub.f32 f214, f212, f213;
add.f32 f215, f212, f213;
add.f32 f216, f183, f199;
add.f32 f217, f184, f200;
sub.f32 f218, f183, f199;
sub.f32 f219, f184, f200;
add.f32 f220, f187, f209;
add.f32 f221, f188, f211;
sub.f32 f222, f187, f209;
sub.f32 f223, f188, f211;
add.f32 f224, f185, f202;
sub.f32 f225, f186, f201;
sub.f32 f226, f185, f202;
add.f32 f227, f186, f201;
add.f32 f228, f189, f214;
add.f32 f229, f190, f215;
sub.f32 f230, f189, f214;
sub.f32 f231, f190, f215;
and.b32 r14, r5, 8;
cvt.u64.u32 rd6, r14;
mov.u64 rd7, %18;
add.s64 rd8, rd7, rd6;
ld.global.v2.f32 {f232, f233}, [rd8];
mul.f32 f236, f232, f220;
mul.f32 f237, f233, f221;
sub.f32 f238, f236, f237;
mul.f32 f239, f232, f221;
fma.rn.f32 f240, f233, f220, f239;
mul.f32 f241, f232, f232;
mul.f32 f242, f233, f233;
sub.f32 f243, f241, f242;
mul.f32 f244, f233, f232;
fma.rn.f32 f245, f233, f232, f244;
mul.f32 f246, f243, f224;
mul.f32 f247, f245, f225;
sub.f32 f248, f246, f247;
mul.f32 f249, f243, f225;
fma.rn.f32 f250, f245, f224, f249;
mul.f32 f251, f232, f243;
mul.f32 f252, f233, f245;
sub.f32 f253, f251, f252;
mul.f32 f254, f232, f245;
fma.rn.f32 f255, f233, f243, f254;
mul.f32 f256, f253, f228;
mul.f32 f257, f255, f229;
sub.f32 f258, f256, f257;
mul.f32 f259, f253, f229;
fma.rn.f32 f260, f255, f228, f259;
mul.f32 f261, f232, f253;
mul.f32 f262, f233, f255;
sub.f32 f263, f261, f262;
mul.f32 f264, f232, f255;
fma.rn.f32 f265, f233, f253, f264;
mul.f32 f266, f263, f218;
mul.f32 f267, f265, f219;
sub.f32 f268, f266, f267;
mul.f32 f269, f263, f219;
fma.rn.f32 f270, f265, f218, f269;
mul.f32 f271, f232, f263;
mul.f32 f272, f233, f265;
sub.f32 f273, f271, f272;
mul.f32 f274, f232, f265;
fma.rn.f32 f275, f233, f263, f274;
mul.f32 f276, f273, f222;
mul.f32 f277, f275, f223;
sub.f32 f278, f276, f277;
mul.f32 f279, f273, f223;
fma.rn.f32 f280, f275, f222, f279;
mul.f32 f281, f232, f273;
mul.f32 f282, f233, f275;
sub.f32 f283, f281, f282;
mul.f32 f284, f232, f275;
fma.rn.f32 f285, f233, f273, f284;
mul.f32 f286, f283, f226;
mul.f32 f287, f285, f227;
sub.f32 f288, f286, f287;
mul.f32 f289, f283, f227;
fma.rn.f32 f290, f285, f226, f289;
mul.f32 f291, f232, f283;
mul.f32 f292, f233, f285;
sub.f32 f293, f291, f292;
mul.f32 f294, f232, f285;
fma.rn.f32 f295, f233, f283, f294;
mul.f32 f296, f293, f230;
mul.f32 f297, f295, f231;
sub.f32 f298, f296, f297;
mul.f32 f299, f293, f231;
fma.rn.f32 f300, f295, f230, f299;
shl.b32 r15, r5, 2;
and.b32 r16, r15, 28;
add.s32 r17, r10, r16;
barrier.sync 0;
and.b32 r18, r8, 256;
add.s32 r19, r17, r18;
st.shared.f32 [r19], f216;
st.shared.f32 [r19+32], f238;
st.shared.f32 [r19+64], f248;
st.shared.f32 [r19+96], f258;
st.shared.f32 [r19+128], f268;
st.shared.f32 [r19+160], f278;
st.shared.f32 [r19+192], f288;
st.shared.f32 [r19+224], f298;
barrier.sync 0;
mad.lo.s32 r20, r14, -28, r19;
ld.shared.f32 f301, [r20];
ld.shared.f32 f302, [r20+64];
ld.shared.f32 f303, [r20+128];
ld.shared.f32 f304, [r20+192];
ld.shared.f32 f305, [r20+256];
ld.shared.f32 f306, [r20+320];
ld.shared.f32 f307, [r20+384];
ld.shared.f32 f308, [r20+448];
barrier.sync 0;
st.shared.f32 [r19], f217;
st.shared.f32 [r19+32], f240;
st.shared.f32 [r19+64], f250;
st.shared.f32 [r19+96], f260;
st.shared.f32 [r19+128], f270;
st.shared.f32 [r19+160], f280;
st.shared.f32 [r19+192], f290;
st.shared.f32 [r19+224], f300;
barrier.sync 0;
ld.shared.f32 f309, [r20];
ld.shared.f32 f310, [r20+64];
ld.shared.f32 f311, [r20+128];
ld.shared.f32 f312, [r20+192];
ld.shared.f32 f313, [r20+256];
ld.shared.f32 f314, [r20+320];
ld.shared.f32 f315, [r20+384];
ld.shared.f32 f316, [r20+448];
add.f32 %0, f301, f305;
add.f32 %1, f309, f313;
add.f32 %2, f302, f306;
add.f32 %3, f310, f314;
add.f32 %4, f303, f307;
add.f32 %5, f311, f315;
add.f32 %6, f304, f308;
add.f32 %7, f312, f316;
sub.f32 %8, f301, f305;
sub.f32 %9, f309, f313;
sub.f32 %10, f302, f306;
sub.f32 %11, f310, f314;
sub.f32 %12, f303, f307;
sub.f32 %13, f311, f315;
sub.f32 %14, f304, f308;
sub.f32 %15, f312, f316;
})"
     // Operand map: %0-%15 = outputs rmem[0..7].x/.y; %16 = smem; %17 = lut_sp_8_128;
     // %18 = lut_sp_8_16; %19-%38 = input copies of rmem[0..7]. rmem[1].y, rmem[2].y,
     // rmem[4].y and rmem[5].y are each bound twice; the PTX text references only the
     // second binding of each (%23, %26, %31, %34), so the list must stay as is.
     : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y): "r"(smem), "l"(lut_sp_8_128), "l"(lut_sp_8_16), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y));
};




// cufftdx_private_function<55, float, 1>
//
// Device-only, force-inlined specialization whose whole body is one inline-PTX
// block. It transforms rmem[0..3] in place: every rmem[k].x / rmem[k].y is bound
// both as an input and as an output operand of the asm statement.
//
// Parameters:
//   rmem - per-thread array of complex<float>; elements 0..3 are read and overwritten.
//   smem - 32-bit value used as the base address of the st.shared / ld.shared
//          accesses. The PTX adds (tid.y << 10) and ((tid.x << 5) & -1024) to it
//          before adding the per-thread offsets.
//
// Sequence performed by the PTX text:
//   1. Add/sub butterflies over the four complex inputs.
//   2. One (f31, f32) pair is loaded from lut_sp_4_128 at byte offset
//      (tid.x << 3) & 248. Its complex square and cube are built by complex
//      multiplication, and powers 1..3 are multiplied into three of the four
//      butterfly results; the fourth (f55, f54) is stored unscaled.
//   3. The four complex results are written to shared memory as interleaved
//      (x, y) pairs, barrier.sync 0 is issued, and four complex values are read
//      back 256 bytes apart.
//   4. Steps 1-3 run twice more: with lut_sp_4_32 (8-byte entry selected by bits
//      2..4 of tid.x, stores 32 bytes apart) and with lut_sp_4_8 (entry selected
//      by bit 4 of tid.x, stores 128 bytes apart).
//   5. A final add/sub of the loaded pairs produces the outputs: the sums go to
//      rmem[0..1], the differences to rmem[2..3].
//
// NOTE(review): barrier.sync 0 is executed six times unconditionally, so
// presumably every thread taking part in barrier 0 must call this function
// together - confirm against the caller.
// NOTE(review): the include guard (CUFFTDX_FFT_128_FP32_FWD_PTX_HPP) suggests this
// is one variant of a 128-point forward FP32 FFT and that the lut_sp_* tables
// hold twiddle factors; neither table is defined in the visible source - confirm.
template<> __forceinline__ __device__ void cufftdx_private_function<55, float, 1>(cufftdx::detail::complex<float> *rmem, unsigned smem){

// The PTX below is kept verbatim: operand numbers (%N), register numbering and
// barrier placement all depend on the exact text.
asm volatile (R"({
.reg .f32 f<208>;
.reg .b32 r<28>;
.reg .b64 rd<12>;
mov.u32 r1, %tid.y;
shl.b32 r2, r1, 10;
mov.u32 r3, %8;
add.s32 r4, r3, r2;
mov.u32 r5, %tid.x;
add.f32 f17, %12, %17;
add.f32 f18, %13, %19;
sub.f32 f19, %12, %17;
sub.f32 f20, %13, %19;
add.f32 f21, %14, %20;
add.f32 f22, %16, %21;
sub.f32 f23, %14, %20;
sub.f32 f24, %16, %21;
sub.f32 f25, f17, f21;
sub.f32 f26, f18, f22;
add.f32 f27, f19, f24;
sub.f32 f28, f20, f23;
sub.f32 f29, f19, f24;
add.f32 f30, f20, f23;
and.b32 r6, r5, 31;
shl.b32 r7, r5, 5;
and.b32 r8, r7, -1024;
add.s32 r9, r4, r8;
shl.b32 r10, r5, 3;
cvt.u64.u32 rd2, r10;
and.b64 rd3, rd2, 248;
mov.u64 rd4, %9;
add.s64 rd5, rd4, rd3;
ld.global.v2.f32 {f31, f32}, [rd5];
mul.f32 f35, f31, f27;
mul.f32 f36, f32, f28;
mul.f32 f37, f31, f28;
mul.f32 f38, f31, f31;
mul.f32 f39, f32, f32;
sub.f32 f40, f38, f39;
mul.f32 f41, f32, f31;
fma.rn.f32 f42, f32, f31, f41;
mul.f32 f43, f40, f25;
mul.f32 f44, f42, f26;
mul.f32 f45, f40, f26;
mul.f32 f46, f31, f40;
mul.f32 f47, f32, f42;
sub.f32 f48, f46, f47;
mul.f32 f49, f31, f42;
fma.rn.f32 f50, f32, f40, f49;
mul.f32 f51, f48, f29;
mul.f32 f52, f50, f30;
mul.f32 f53, f48, f30;
barrier.sync 0;
and.b32 r11, r7, 992;
add.s32 r12, r9, r11;
add.f32 f54, f18, f22;
add.f32 f55, f17, f21;
fma.rn.f32 f56, f32, f27, f37;
sub.f32 f57, f35, f36;
st.shared.v4.f32 [r12], {f55, f54, f57, f56};
sub.f32 f58, f43, f44;
fma.rn.f32 f59, f42, f25, f45;
fma.rn.f32 f60, f50, f29, f53;
sub.f32 f61, f51, f52;
st.shared.v4.f32 [r12+16], {f58, f59, f61, f60};
barrier.sync 0;
mad.lo.s32 r13, r6, -24, r12;
ld.shared.v2.f32 {f62, f63}, [r13];
ld.shared.v2.f32 {f66, f67}, [r13+256];
ld.shared.v2.f32 {f70, f71}, [r13+512];
ld.shared.v2.f32 {f74, f75}, [r13+768];
add.f32 f78, f62, f70;
add.f32 f79, f63, f71;
sub.f32 f80, f62, f70;
sub.f32 f81, f63, f71;
add.f32 f82, f66, f74;
add.f32 f83, f67, f75;
sub.f32 f84, f66, f74;
sub.f32 f85, f67, f75;
sub.f32 f86, f78, f82;
sub.f32 f87, f79, f83;
add.f32 f88, f80, f85;
sub.f32 f89, f81, f84;
sub.f32 f90, f80, f85;
add.f32 f91, f81, f84;
and.b32 r14, r5, 28;
bfe.u32 r15, r5, 2, 3;
mul.wide.u32 rd6, r15, 8;
mov.u64 rd7, %10;
add.s64 rd8, rd7, rd6;
ld.global.v2.f32 {f92, f93}, [rd8];
mul.f32 f96, f92, f88;
mul.f32 f97, f93, f89;
mul.f32 f98, f92, f89;
mul.f32 f99, f92, f92;
mul.f32 f100, f93, f93;
sub.f32 f101, f99, f100;
mul.f32 f102, f93, f92;
fma.rn.f32 f103, f93, f92, f102;
mul.f32 f104, f101, f86;
mul.f32 f105, f103, f87;
mul.f32 f106, f101, f87;
mul.f32 f107, f92, f101;
mul.f32 f108, f93, f103;
sub.f32 f109, f107, f108;
mul.f32 f110, f92, f103;
fma.rn.f32 f111, f93, f101, f110;
mul.f32 f112, f109, f90;
mul.f32 f113, f111, f91;
mul.f32 f114, f109, f91;
and.b32 r16, r10, 24;
add.s32 r17, r9, r16;
barrier.sync 0;
and.b32 r18, r7, 896;
add.s32 r19, r17, r18;
add.f32 f115, f79, f83;
add.f32 f116, f78, f82;
st.shared.v2.f32 [r19], {f116, f115};
fma.rn.f32 f117, f93, f88, f98;
sub.f32 f118, f96, f97;
st.shared.v2.f32 [r19+32], {f118, f117};
fma.rn.f32 f119, f103, f86, f106;
sub.f32 f120, f104, f105;
st.shared.v2.f32 [r19+64], {f120, f119};
sub.f32 f121, f112, f113;
fma.rn.f32 f122, f111, f90, f114;
st.shared.v2.f32 [r19+96], {f121, f122};
barrier.sync 0;
mad.lo.s32 r20, r14, -24, r19;
ld.shared.v2.f32 {f123, f124}, [r20];
ld.shared.v2.f32 {f127, f128}, [r20+256];
ld.shared.v2.f32 {f131, f132}, [r20+512];
ld.shared.v2.f32 {f135, f136}, [r20+768];
add.f32 f139, f123, f131;
add.f32 f140, f124, f132;
sub.f32 f141, f123, f131;
sub.f32 f142, f124, f132;
add.f32 f143, f127, f135;
add.f32 f144, f128, f136;
sub.f32 f145, f127, f135;
sub.f32 f146, f128, f136;
sub.f32 f147, f139, f143;
sub.f32 f148, f140, f144;
add.f32 f149, f141, f146;
sub.f32 f150, f142, f145;
sub.f32 f151, f141, f146;
add.f32 f152, f142, f145;
and.b32 r21, r5, 16;
bfe.u32 r22, r5, 4, 1;
mul.wide.u32 rd9, r22, 8;
mov.u64 rd10, %11;
add.s64 rd11, rd10, rd9;
ld.global.v2.f32 {f153, f154}, [rd11];
mul.f32 f157, f153, f149;
mul.f32 f158, f154, f150;
mul.f32 f159, f153, f150;
mul.f32 f160, f153, f153;
mul.f32 f161, f154, f154;
sub.f32 f162, f160, f161;
mul.f32 f163, f154, f153;
fma.rn.f32 f164, f154, f153, f163;
mul.f32 f165, f162, f147;
mul.f32 f166, f164, f148;
mul.f32 f167, f162, f148;
mul.f32 f168, f153, f162;
mul.f32 f169, f154, f164;
sub.f32 f170, f168, f169;
mul.f32 f171, f153, f164;
fma.rn.f32 f172, f154, f162, f171;
mul.f32 f173, f170, f151;
mul.f32 f174, f172, f152;
mul.f32 f175, f170, f152;
and.b32 r23, r10, 120;
add.s32 r24, r9, r23;
barrier.sync 0;
and.b32 r25, r7, 512;
add.s32 r26, r24, r25;
add.f32 f176, f140, f144;
add.f32 f177, f139, f143;
st.shared.v2.f32 [r26], {f177, f176};
fma.rn.f32 f178, f154, f149, f159;
sub.f32 f179, f157, f158;
st.shared.v2.f32 [r26+128], {f179, f178};
fma.rn.f32 f180, f164, f147, f167;
sub.f32 f181, f165, f166;
st.shared.v2.f32 [r26+256], {f181, f180};
sub.f32 f182, f173, f174;
fma.rn.f32 f183, f172, f151, f175;
st.shared.v2.f32 [r26+384], {f182, f183};
barrier.sync 0;
mad.lo.s32 r27, r21, -24, r26;
ld.shared.v2.f32 {f184, f185}, [r27];
ld.shared.v2.f32 {f188, f189}, [r27+256];
ld.shared.v2.f32 {f192, f193}, [r27+512];
ld.shared.v2.f32 {f196, f197}, [r27+768];
add.f32 %1, f185, f193;
add.f32 %0, f184, f192;
add.f32 %3, f189, f197;
add.f32 %2, f188, f196;
sub.f32 %5, f185, f193;
sub.f32 %4, f184, f192;
sub.f32 %7, f189, f197;
sub.f32 %6, f188, f196;
})"
     // Operand map: %0-%7 = outputs rmem[0..3].x/.y; %8 = smem; %9 = lut_sp_4_128;
     // %10 = lut_sp_4_32; %11 = lut_sp_4_8; %12-%21 = input copies of rmem[0..3].
     // rmem[1].y and rmem[2].y are each bound twice; the PTX text references only the
     // second binding of each (%16, %19), so the list must stay as is.
     : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y): "r"(smem), "l"(lut_sp_4_128), "l"(lut_sp_4_32), "l"(lut_sp_4_8), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y));
};




// cufftdx_private_function<56, float, 1>
//
// Device-only, force-inlined specialization whose whole body is one inline-PTX
// block. It transforms rmem[0..3] in place: every rmem[k].x / rmem[k].y is bound
// both as an input and as an output operand of the asm statement.
//
// Parameters:
//   rmem - per-thread array of complex<float>; elements 0..3 are read and overwritten.
//   smem - 32-bit value used as the base address of the st.shared / ld.shared
//          accesses. The PTX adds (tid.y << 9) and ((tid.x << 4) & -512) to it
//          before adding the per-thread offsets.
//
// Sequence performed by the PTX text:
//   1. Add/sub butterflies over the four complex inputs.
//   2. One (f33, f34) pair is loaded from lut_sp_4_128 at byte offset
//      (tid.x << 3) & 248. Its complex square and cube are built by complex
//      multiplication, and powers 1..3 are multiplied into three of the four
//      butterfly results; the fourth (f25, f26) is stored unscaled.
//   3. The results are exchanged between threads through shared memory in two
//      passes over the same addresses (first the values that end up in the .x
//      outputs, then the ones that end up in the .y outputs). Each store group
//      and each load group is separated by barrier.sync 0.
//   4. Steps 1-3 run twice more: with lut_sp_4_32 (8-byte entry selected by bits
//      2..4 of tid.x, scalar stores 16 bytes apart) and with lut_sp_4_8 (entry
//      selected by bit 4 of tid.x, scalar stores 64 bytes apart).
//   5. A final add/sub of the loaded values produces the outputs: the sums go to
//      rmem[0..1], the differences to rmem[2..3].
//
// This specialization binds the same lookup tables and the same operand list as
// the <55, float, 1> specialization; it differs in storing one float component
// per shared-memory pass instead of interleaved (x, y) pairs.
//
// NOTE(review): barrier.sync 0 is executed twelve times unconditionally, so
// presumably every thread taking part in barrier 0 must call this function
// together - confirm against the caller.
// NOTE(review): the include guard (CUFFTDX_FFT_128_FP32_FWD_PTX_HPP) suggests this
// is one variant of a 128-point forward FP32 FFT and that the lut_sp_* tables
// hold twiddle factors; neither table is defined in the visible source - confirm.
template<> __forceinline__ __device__ void cufftdx_private_function<56, float, 1>(cufftdx::detail::complex<float> *rmem, unsigned smem){

// The PTX below is kept verbatim: operand numbers (%N), register numbering and
// barrier placement all depend on the exact text.
asm volatile (R"({
.reg .f32 f<184>;
.reg .b32 r<29>;
.reg .b64 rd<12>;
mov.u32 r1, %tid.y;
shl.b32 r2, r1, 9;
mov.u32 r3, %8;
add.s32 r4, r3, r2;
mov.u32 r5, %tid.x;
add.f32 f17, %12, %17;
add.f32 f18, %13, %19;
sub.f32 f19, %12, %17;
sub.f32 f20, %13, %19;
add.f32 f21, %14, %20;
add.f32 f22, %16, %21;
sub.f32 f23, %14, %20;
sub.f32 f24, %16, %21;
add.f32 f25, f17, f21;
add.f32 f26, f18, f22;
sub.f32 f27, f17, f21;
sub.f32 f28, f18, f22;
add.f32 f29, f19, f24;
sub.f32 f30, f20, f23;
sub.f32 f31, f19, f24;
add.f32 f32, f20, f23;
and.b32 r6, r5, 31;
shl.b32 r7, r5, 3;
cvt.u64.u32 rd2, r7;
and.b64 rd3, rd2, 248;
mov.u64 rd4, %9;
add.s64 rd5, rd4, rd3;
ld.global.v2.f32 {f33, f34}, [rd5];
mul.f32 f37, f33, f29;
mul.f32 f38, f34, f30;
sub.f32 f39, f37, f38;
mul.f32 f40, f33, f30;
fma.rn.f32 f41, f34, f29, f40;
mul.f32 f42, f33, f33;
mul.f32 f43, f34, f34;
sub.f32 f44, f42, f43;
mul.f32 f45, f34, f33;
fma.rn.f32 f46, f34, f33, f45;
mul.f32 f47, f44, f27;
mul.f32 f48, f46, f28;
sub.f32 f49, f47, f48;
mul.f32 f50, f44, f28;
fma.rn.f32 f51, f46, f27, f50;
mul.f32 f52, f33, f44;
mul.f32 f53, f34, f46;
sub.f32 f54, f52, f53;
mul.f32 f55, f33, f46;
fma.rn.f32 f56, f34, f44, f55;
mul.f32 f57, f54, f31;
mul.f32 f58, f56, f32;
sub.f32 f59, f57, f58;
mul.f32 f60, f54, f32;
fma.rn.f32 f61, f56, f31, f60;
shl.b32 r8, r5, 4;
and.b32 r9, r8, -512;
add.s32 r10, r4, r9;
barrier.sync 0;
and.b32 r11, r8, 496;
add.s32 r12, r10, r11;
st.shared.v4.f32 [r12], {f25, f39, f49, f59};
barrier.sync 0;
mad.lo.s32 r13, r6, -12, r12;
ld.shared.f32 f62, [r13];
ld.shared.f32 f63, [r13+128];
ld.shared.f32 f64, [r13+256];
ld.shared.f32 f65, [r13+384];
barrier.sync 0;
st.shared.v4.f32 [r12], {f26, f41, f51, f61};
barrier.sync 0;
ld.shared.f32 f66, [r13];
ld.shared.f32 f67, [r13+128];
ld.shared.f32 f68, [r13+256];
ld.shared.f32 f69, [r13+384];
add.f32 f70, f62, f64;
add.f32 f71, f66, f68;
sub.f32 f72, f62, f64;
sub.f32 f73, f66, f68;
add.f32 f74, f63, f65;
add.f32 f75, f67, f69;
sub.f32 f76, f63, f65;
sub.f32 f77, f67, f69;
add.f32 f78, f70, f74;
add.f32 f79, f71, f75;
sub.f32 f80, f70, f74;
sub.f32 f81, f71, f75;
add.f32 f82, f72, f77;
sub.f32 f83, f73, f76;
sub.f32 f84, f72, f77;
add.f32 f85, f73, f76;
and.b32 r14, r5, 28;
bfe.u32 r15, r5, 2, 3;
mul.wide.u32 rd6, r15, 8;
mov.u64 rd7, %10;
add.s64 rd8, rd7, rd6;
ld.global.v2.f32 {f86, f87}, [rd8];
mul.f32 f90, f86, f82;
mul.f32 f91, f87, f83;
sub.f32 f92, f90, f91;
mul.f32 f93, f86, f83;
fma.rn.f32 f94, f87, f82, f93;
mul.f32 f95, f86, f86;
mul.f32 f96, f87, f87;
sub.f32 f97, f95, f96;
mul.f32 f98, f87, f86;
fma.rn.f32 f99, f87, f86, f98;
mul.f32 f100, f97, f80;
mul.f32 f101, f99, f81;
sub.f32 f102, f100, f101;
mul.f32 f103, f97, f81;
fma.rn.f32 f104, f99, f80, f103;
mul.f32 f105, f86, f97;
mul.f32 f106, f87, f99;
sub.f32 f107, f105, f106;
mul.f32 f108, f86, f99;
fma.rn.f32 f109, f87, f97, f108;
mul.f32 f110, f107, f84;
mul.f32 f111, f109, f85;
sub.f32 f112, f110, f111;
mul.f32 f113, f107, f85;
fma.rn.f32 f114, f109, f84, f113;
shl.b32 r16, r5, 2;
and.b32 r17, r16, 12;
add.s32 r18, r10, r17;
barrier.sync 0;
and.b32 r19, r8, 448;
add.s32 r20, r18, r19;
st.shared.f32 [r20], f78;
st.shared.f32 [r20+16], f92;
st.shared.f32 [r20+32], f102;
st.shared.f32 [r20+48], f112;
barrier.sync 0;
mad.lo.s32 r21, r14, -12, r20;
ld.shared.f32 f115, [r21];
ld.shared.f32 f116, [r21+128];
ld.shared.f32 f117, [r21+256];
ld.shared.f32 f118, [r21+384];
barrier.sync 0;
st.shared.f32 [r20], f79;
st.shared.f32 [r20+16], f94;
st.shared.f32 [r20+32], f104;
st.shared.f32 [r20+48], f114;
barrier.sync 0;
ld.shared.f32 f119, [r21];
ld.shared.f32 f120, [r21+128];
ld.shared.f32 f121, [r21+256];
ld.shared.f32 f122, [r21+384];
add.f32 f123, f115, f117;
add.f32 f124, f119, f121;
sub.f32 f125, f115, f117;
sub.f32 f126, f119, f121;
add.f32 f127, f116, f118;
add.f32 f128, f120, f122;
sub.f32 f129, f116, f118;
sub.f32 f130, f120, f122;
add.f32 f131, f123, f127;
add.f32 f132, f124, f128;
sub.f32 f133, f123, f127;
sub.f32 f134, f124, f128;
add.f32 f135, f125, f130;
sub.f32 f136, f126, f129;
sub.f32 f137, f125, f130;
add.f32 f138, f126, f129;
and.b32 r22, r5, 16;
bfe.u32 r23, r5, 4, 1;
mul.wide.u32 rd9, r23, 8;
mov.u64 rd10, %11;
add.s64 rd11, rd10, rd9;
ld.global.v2.f32 {f139, f140}, [rd11];
mul.f32 f143, f139, f135;
mul.f32 f144, f140, f136;
sub.f32 f145, f143, f144;
mul.f32 f146, f139, f136;
fma.rn.f32 f147, f140, f135, f146;
mul.f32 f148, f139, f139;
mul.f32 f149, f140, f140;
sub.f32 f150, f148, f149;
mul.f32 f151, f140, f139;
fma.rn.f32 f152, f140, f139, f151;
mul.f32 f153, f150, f133;
mul.f32 f154, f152, f134;
sub.f32 f155, f153, f154;
mul.f32 f156, f150, f134;
fma.rn.f32 f157, f152, f133, f156;
mul.f32 f158, f139, f150;
mul.f32 f159, f140, f152;
sub.f32 f160, f158, f159;
mul.f32 f161, f139, f152;
fma.rn.f32 f162, f140, f150, f161;
mul.f32 f163, f160, f137;
mul.f32 f164, f162, f138;
sub.f32 f165, f163, f164;
mul.f32 f166, f160, f138;
fma.rn.f32 f167, f162, f137, f166;
and.b32 r24, r16, 60;
add.s32 r25, r10, r24;
barrier.sync 0;
and.b32 r26, r8, 256;
add.s32 r27, r25, r26;
st.shared.f32 [r27], f131;
st.shared.f32 [r27+64], f145;
st.shared.f32 [r27+128], f155;
st.shared.f32 [r27+192], f165;
barrier.sync 0;
mad.lo.s32 r28, r22, -12, r27;
ld.shared.f32 f168, [r28];
ld.shared.f32 f169, [r28+128];
ld.shared.f32 f170, [r28+256];
ld.shared.f32 f171, [r28+384];
barrier.sync 0;
st.shared.f32 [r27], f132;
st.shared.f32 [r27+64], f147;
st.shared.f32 [r27+128], f157;
st.shared.f32 [r27+192], f167;
barrier.sync 0;
ld.shared.f32 f172, [r28];
ld.shared.f32 f173, [r28+128];
ld.shared.f32 f174, [r28+256];
ld.shared.f32 f175, [r28+384];
add.f32 %0, f168, f170;
add.f32 %1, f172, f174;
add.f32 %2, f169, f171;
add.f32 %3, f173, f175;
sub.f32 %4, f168, f170;
sub.f32 %5, f172, f174;
sub.f32 %6, f169, f171;
sub.f32 %7, f173, f175;
})"
     // Operand map: %0-%7 = outputs rmem[0..3].x/.y; %8 = smem; %9 = lut_sp_4_128;
     // %10 = lut_sp_4_32; %11 = lut_sp_4_8; %12-%21 = input copies of rmem[0..3].
     // rmem[1].y and rmem[2].y are each bound twice; the PTX text references only the
     // second binding of each (%16, %19), so the list must stay as is.
     : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y): "r"(smem), "l"(lut_sp_4_128), "l"(lut_sp_4_32), "l"(lut_sp_4_8), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y));
};




// cufftdx_private_function<57, float, 1>
//
// Device-side specialization implemented as a single inline-PTX block. It reads
// the 32 complex<float> values rmem[0..31] from registers, transforms them, and
// writes 32 complex<float> results back to rmem[0..31].
// (The include guard of this header suggests these routines are pieces of a
// 128-point FP32 forward FFT -- TODO confirm against the generator/caller.)
//
// Parameters:
//   rmem - per-thread array of 32 complex values; every .x/.y component is both
//          an input and an output operand of the asm statement.
//   smem - 32-bit value used as the base address for st.shared/ld.shared.
//          The PTX adds (%tid.y << 10) and ((%tid.x << 8) & -1024) to it, then
//          stores 256 bytes at offset (%tid.x & 3) * 256 and loads 8-byte pairs
//          at offset (%tid.x & 3) * 8 + 32 * k for k = 0..31, i.e. a 1024-byte
//          window is touched for each group of four consecutive %tid.x values.
//
// Also reads lut_sp_32_128 (declared elsewhere): one {f32, f32} pair is fetched
// with ld.global.v2.f32 at byte offset (%tid.x << 3) & 24.
//
// Outline of the PTX body, in execution order:
//   1. An add/sub butterfly network over the 32 inputs with constant
//      multipliers (0f3F3504F3 ~ 0.70711, 0f3F6C835E ~ 0.92388,
//      0f3EC3EF15 ~ 0.38268, 0f3F7B14BE ~ 0.98079, 0f3E47C5C2 ~ 0.19509,
//      0f3F54DB31 ~ 0.83147, 0f3F0E39DA ~ 0.55557, and their negations).
//      The structure is consistent with a 32-point forward DFT held entirely
//      in registers.
//   2. The pair loaded from lut_sp_32_128 is treated as a complex number w; its
//      powers w^2 .. w^31 are built by repeated complex multiplication, and
//      intermediate value k (k = 1..31) is multiplied by w^k. Value 0 is passed
//      through unscaled.
//   3. barrier.sync 0, sixteen st.shared.v4.f32 stores (32 complex values),
//      barrier.sync 0, then thirty-two ld.shared.v2.f32 loads with a 32-byte
//      stride, which exchanges data between the four threads of a group.
//   4. A final add/sub stage (eight 4-input butterflies, no further
//      multipliers) that writes the 64 output operands.
//
// Synchronization: barrier.sync 0 is executed twice unconditionally, so every
// thread participating in barrier 0 has to call this function together
// (presumably the whole thread block -- verify against the caller).
//
// NOTE(review): the asm statement has no "memory" clobber even though it
// stores to and loads from shared memory; confirm that callers do not depend
// on the compiler ordering other shared-memory accesses around this call.
template<> __forceinline__ __device__ void cufftdx_private_function<57, float, 1>(cufftdx::detail::complex<float> *rmem, unsigned smem){

asm volatile (R"({
.reg .f32 f<1432>;
.reg .b32 r<22>;
.reg .b64 rd<9>;
mov.u32 r1, %tid.y;
shl.b32 r2, r1, 10;
mov.u32 r3, %64;
add.s32 r4, r3, r2;
add.f32 f129, %66, %98;
sub.f32 f131, %66, %98;
add.f32 f1427, %67, %130;
sub.f32 f132, %67, %130;
add.f32 f133, %82, %114;
sub.f32 f135, %82, %114;
add.f32 f1425, %131, %115;
sub.f32 f136, %131, %115;
add.f32 f137, f129, f133;
sub.f32 f139, f129, f133;
add.f32 f1424, f1427, f1425;
sub.f32 f140, f1427, f1425;
add.f32 f141, f131, f136;
sub.f32 f143, f131, f136;
sub.f32 f1423, f132, f135;
add.f32 f144, f132, f135;
add.f32 f145, %74, %106;
sub.f32 f147, %74, %106;
add.f32 f1420, %133, %132;
sub.f32 f148, %133, %132;
add.f32 f149, %90, %122;
sub.f32 f151, %90, %122;
add.f32 f1418, %91, %134;
sub.f32 f152, %91, %134;
add.f32 f153, f145, f149;
sub.f32 f155, f145, f149;
add.f32 f1417, f1420, f1418;
sub.f32 f156, f1420, f1418;
add.f32 f157, f147, f152;
sub.f32 f159, f147, f152;
sub.f32 f1416, f148, f151;
add.f32 f160, f148, f151;
mul.f32 f162, f1416, 0fBF3504F3;
mul.f32 f1415, f157, 0f3F3504F3;
sub.f32 f163, f1415, f162;
mul.f32 f164, f1416, 0f3F3504F3;
fma.rn.f32 f165, f157, 0fBF3504F3, f164;
mul.f32 f166, f159, 0fBF3504F3;
mul.f32 f167, f160, 0fBF3504F3;
sub.f32 f168, f166, f167;
add.f32 f169, f166, f167;
add.f32 f170, f137, f153;
sub.f32 f172, f137, f153;
add.f32 f1414, f1424, f1417;
sub.f32 f173, f1424, f1417;
add.f32 f174, f141, f163;
sub.f32 f176, f141, f163;
add.f32 f1413, f1423, f165;
sub.f32 f177, f1423, f165;
add.f32 f178, f139, f156;
sub.f32 f180, f139, f156;
sub.f32 f1412, f140, f155;
add.f32 f181, f140, f155;
add.f32 f182, f143, f168;
sub.f32 f184, f143, f168;
add.f32 f1411, f144, f169;
sub.f32 f185, f144, f169;
add.f32 f186, %70, %102;
sub.f32 f188, %70, %102;
add.f32 f1409, %135, %103;
sub.f32 f189, %135, %103;
add.f32 f190, %86, %118;
sub.f32 f192, %86, %118;
add.f32 f1406, %137, %136;
sub.f32 f193, %137, %136;
add.f32 f194, f186, f190;
sub.f32 f196, f186, f190;
add.f32 f1405, f1409, f1406;
sub.f32 f197, f1409, f1406;
add.f32 f198, f188, f193;
sub.f32 f200, f188, f193;
sub.f32 f1404, f189, f192;
add.f32 f201, f189, f192;
add.f32 f202, %78, %110;
sub.f32 f204, %78, %110;
add.f32 f1402, %79, %138;
sub.f32 f205, %79, %138;
add.f32 f206, %94, %126;
sub.f32 f208, %94, %126;
add.f32 f1400, %139, %127;
sub.f32 f209, %139, %127;
add.f32 f210, f202, f206;
sub.f32 f212, f202, f206;
add.f32 f1399, f1402, f1400;
sub.f32 f213, f1402, f1400;
add.f32 f214, f204, f209;
sub.f32 f216, f204, f209;
sub.f32 f1398, f205, f208;
add.f32 f217, f205, f208;
mul.f32 f219, f1398, 0fBF3504F3;
mul.f32 f1397, f214, 0f3F3504F3;
sub.f32 f220, f1397, f219;
mul.f32 f221, f1398, 0f3F3504F3;
fma.rn.f32 f222, f214, 0fBF3504F3, f221;
mul.f32 f223, f216, 0fBF3504F3;
mul.f32 f224, f217, 0fBF3504F3;
sub.f32 f225, f223, f224;
add.f32 f226, f223, f224;
add.f32 f227, f194, f210;
sub.f32 f229, f194, f210;
add.f32 f1396, f1405, f1399;
sub.f32 f230, f1405, f1399;
add.f32 f231, f198, f220;
sub.f32 f233, f198, f220;
add.f32 f1395, f1404, f222;
sub.f32 f234, f1404, f222;
add.f32 f235, f196, f213;
sub.f32 f237, f196, f213;
sub.f32 f1394, f197, f212;
add.f32 f238, f197, f212;
add.f32 f239, f200, f225;
sub.f32 f241, f200, f225;
add.f32 f1393, f201, f226;
sub.f32 f242, f201, f226;
mul.f32 f1391, f231, 0f3F6C835E;
mul.f32 f1392, f1395, 0fBEC3EF15;
sub.f32 f245, f1391, f1392;
mul.f32 f246, f1395, 0f3F6C835E;
fma.rn.f32 f247, f231, 0fBEC3EF15, f246;
mul.f32 f1389, f235, 0f3F3504F3;
mul.f32 f1390, f1394, 0fBF3504F3;
sub.f32 f250, f1389, f1390;
mul.f32 f251, f1394, 0f3F3504F3;
fma.rn.f32 f252, f235, 0fBF3504F3, f251;
mul.f32 f1387, f239, 0f3EC3EF15;
mul.f32 f1388, f1393, 0fBF6C835E;
sub.f32 f255, f1387, f1388;
mul.f32 f256, f1393, 0f3EC3EF15;
fma.rn.f32 f257, f239, 0fBF6C835E, f256;
mul.f32 f1385, f233, 0fBEC3EF15;
mul.f32 f1386, f234, 0fBF6C835E;
sub.f32 f260, f1385, f1386;
mul.f32 f261, f234, 0fBEC3EF15;
fma.rn.f32 f262, f233, 0fBF6C835E, f261;
mul.f32 f263, f237, 0fBF3504F3;
mul.f32 f264, f238, 0fBF3504F3;
sub.f32 f265, f263, f264;
add.f32 f266, f263, f264;
mul.f32 f1383, f241, 0fBF6C835E;
mul.f32 f1384, f242, 0fBEC3EF15;
sub.f32 f269, f1383, f1384;
mul.f32 f270, f242, 0fBF6C835E;
fma.rn.f32 f271, f241, 0fBEC3EF15, f270;
add.f32 f272, f170, f227;
sub.f32 f274, f170, f227;
add.f32 f1382, f1414, f1396;
sub.f32 f275, f1414, f1396;
add.f32 f276, f174, f245;
sub.f32 f278, f174, f245;
add.f32 f1381, f1413, f247;
sub.f32 f279, f1413, f247;
add.f32 f280, f178, f250;
sub.f32 f282, f178, f250;
add.f32 f1380, f1412, f252;
sub.f32 f283, f1412, f252;
add.f32 f284, f182, f255;
sub.f32 f286, f182, f255;
add.f32 f1379, f1411, f257;
sub.f32 f287, f1411, f257;
add.f32 f288, f172, f230;
sub.f32 f290, f172, f230;
sub.f32 f1378, f173, f229;
add.f32 f291, f173, f229;
add.f32 f292, f176, f260;
sub.f32 f294, f176, f260;
add.f32 f1377, f177, f262;
sub.f32 f295, f177, f262;
add.f32 f296, f180, f265;
sub.f32 f298, f180, f265;
add.f32 f1376, f181, f266;
sub.f32 f299, f181, f266;
add.f32 f300, f184, f269;
sub.f32 f302, f184, f269;
add.f32 f1375, f185, f271;
sub.f32 f303, f185, f271;
add.f32 f304, %68, %100;
sub.f32 f306, %68, %100;
add.f32 f1372, %141, %140;
sub.f32 f307, %141, %140;
add.f32 f308, %84, %116;
sub.f32 f310, %84, %116;
add.f32 f1370, %85, %142;
sub.f32 f311, %85, %142;
add.f32 f312, f304, f308;
sub.f32 f314, f304, f308;
add.f32 f1369, f1372, f1370;
sub.f32 f315, f1372, f1370;
add.f32 f316, f306, f311;
sub.f32 f318, f306, f311;
sub.f32 f1368, f307, f310;
add.f32 f319, f307, f310;
add.f32 f320, %76, %108;
sub.f32 f322, %76, %108;
add.f32 f1366, %143, %109;
sub.f32 f323, %143, %109;
add.f32 f324, %92, %124;
sub.f32 f326, %92, %124;
add.f32 f1363, %145, %144;
sub.f32 f327, %145, %144;
add.f32 f328, f320, f324;
sub.f32 f330, f320, f324;
add.f32 f1362, f1366, f1363;
sub.f32 f331, f1366, f1363;
add.f32 f332, f322, f327;
sub.f32 f334, f322, f327;
sub.f32 f1361, f323, f326;
add.f32 f335, f323, f326;
mul.f32 f1359, f332, 0f3F3504F3;
mul.f32 f1360, f1361, 0fBF3504F3;
sub.f32 f338, f1359, f1360;
mul.f32 f339, f1361, 0f3F3504F3;
fma.rn.f32 f340, f332, 0fBF3504F3, f339;
mul.f32 f341, f334, 0fBF3504F3;
mul.f32 f342, f335, 0fBF3504F3;
sub.f32 f343, f341, f342;
add.f32 f344, f341, f342;
add.f32 f345, f312, f328;
sub.f32 f347, f312, f328;
add.f32 f1358, f1369, f1362;
sub.f32 f348, f1369, f1362;
add.f32 f349, f316, f338;
sub.f32 f351, f316, f338;
add.f32 f1357, f1368, f340;
sub.f32 f352, f1368, f340;
add.f32 f353, f314, f331;
sub.f32 f355, f314, f331;
sub.f32 f1356, f315, f330;
add.f32 f356, f315, f330;
add.f32 f357, f318, f343;
sub.f32 f359, f318, f343;
add.f32 f1355, f319, f344;
sub.f32 f360, f319, f344;
add.f32 f361, %72, %104;
sub.f32 f363, %72, %104;
add.f32 f1353, %73, %146;
sub.f32 f364, %73, %146;
add.f32 f365, %88, %120;
sub.f32 f367, %88, %120;
add.f32 f1351, %147, %121;
sub.f32 f368, %147, %121;
add.f32 f369, f361, f365;
sub.f32 f371, f361, f365;
add.f32 f1350, f1353, f1351;
sub.f32 f372, f1353, f1351;
add.f32 f373, f363, f368;
sub.f32 f375, f363, f368;
sub.f32 f1349, f364, f367;
add.f32 f376, f364, f367;
add.f32 f377, %80, %112;
sub.f32 f379, %80, %112;
add.f32 f1346, %149, %148;
sub.f32 f380, %149, %148;
add.f32 f381, %96, %128;
sub.f32 f383, %96, %128;
add.f32 f1345, %97, %129;
sub.f32 f384, %97, %129;
add.f32 f385, f377, f381;
sub.f32 f387, f377, f381;
add.f32 f1344, f1346, f1345;
sub.f32 f388, f1346, f1345;
add.f32 f389, f379, f384;
sub.f32 f391, f379, f384;
sub.f32 f1343, f380, f383;
add.f32 f392, f380, f383;
mul.f32 f1341, f389, 0f3F3504F3;
mul.f32 f1342, f1343, 0fBF3504F3;
sub.f32 f395, f1341, f1342;
mul.f32 f396, f1343, 0f3F3504F3;
fma.rn.f32 f397, f389, 0fBF3504F3, f396;
mul.f32 f398, f391, 0fBF3504F3;
mul.f32 f399, f392, 0fBF3504F3;
sub.f32 f400, f398, f399;
add.f32 f401, f398, f399;
add.f32 f402, f369, f385;
sub.f32 f404, f369, f385;
add.f32 f1340, f1350, f1344;
sub.f32 f405, f1350, f1344;
add.f32 f406, f373, f395;
sub.f32 f408, f373, f395;
add.f32 f1339, f1349, f397;
sub.f32 f409, f1349, f397;
add.f32 f410, f371, f388;
sub.f32 f412, f371, f388;
sub.f32 f1338, f372, f387;
add.f32 f413, f372, f387;
add.f32 f414, f375, f400;
sub.f32 f416, f375, f400;
add.f32 f1337, f376, f401;
sub.f32 f417, f376, f401;
mul.f32 f419, f1339, 0fBEC3EF15;
mul.f32 f1336, f406, 0f3F6C835E;
sub.f32 f420, f1336, f419;
mul.f32 f421, f1339, 0f3F6C835E;
fma.rn.f32 f422, f406, 0fBEC3EF15, f421;
mul.f32 f424, f1338, 0fBF3504F3;
mul.f32 f1335, f410, 0f3F3504F3;
sub.f32 f425, f1335, f424;
mul.f32 f426, f1338, 0f3F3504F3;
fma.rn.f32 f427, f410, 0fBF3504F3, f426;
mul.f32 f1333, f414, 0f3EC3EF15;
mul.f32 f1334, f1337, 0fBF6C835E;
sub.f32 f430, f1333, f1334;
mul.f32 f431, f1337, 0f3EC3EF15;
fma.rn.f32 f432, f414, 0fBF6C835E, f431;
mul.f32 f1331, f408, 0fBEC3EF15;
mul.f32 f1332, f409, 0fBF6C835E;
sub.f32 f435, f1331, f1332;
mul.f32 f436, f409, 0fBEC3EF15;
fma.rn.f32 f437, f408, 0fBF6C835E, f436;
mul.f32 f438, f412, 0fBF3504F3;
mul.f32 f439, f413, 0fBF3504F3;
sub.f32 f440, f438, f439;
add.f32 f441, f438, f439;
mul.f32 f443, f417, 0fBEC3EF15;
mul.f32 f1330, f416, 0fBF6C835E;
sub.f32 f444, f1330, f443;
mul.f32 f445, f417, 0fBF6C835E;
fma.rn.f32 f446, f416, 0fBEC3EF15, f445;
add.f32 f447, f345, f402;
sub.f32 f449, f345, f402;
add.f32 f1329, f1358, f1340;
sub.f32 f450, f1358, f1340;
add.f32 f451, f349, f420;
sub.f32 f453, f349, f420;
add.f32 f1328, f1357, f422;
sub.f32 f454, f1357, f422;
add.f32 f455, f353, f425;
sub.f32 f457, f353, f425;
add.f32 f1327, f1356, f427;
sub.f32 f458, f1356, f427;
add.f32 f459, f357, f430;
sub.f32 f461, f357, f430;
add.f32 f1326, f1355, f432;
sub.f32 f462, f1355, f432;
add.f32 f463, f347, f405;
sub.f32 f465, f347, f405;
sub.f32 f1325, f348, f404;
add.f32 f466, f348, f404;
add.f32 f467, f351, f435;
sub.f32 f469, f351, f435;
add.f32 f1324, f352, f437;
sub.f32 f470, f352, f437;
add.f32 f471, f355, f440;
sub.f32 f473, f355, f440;
add.f32 f1323, f356, f441;
sub.f32 f474, f356, f441;
add.f32 f475, f359, f444;
sub.f32 f477, f359, f444;
add.f32 f1322, f360, f446;
sub.f32 f478, f360, f446;
mul.f32 f480, f1328, 0fBE47C5C2;
mul.f32 f1321, f451, 0f3F7B14BE;
sub.f32 f481, f1321, f480;
mul.f32 f482, f1328, 0f3F7B14BE;
fma.rn.f32 f483, f451, 0fBE47C5C2, f482;
mul.f32 f485, f1327, 0fBEC3EF15;
mul.f32 f1320, f455, 0f3F6C835E;
sub.f32 f486, f1320, f485;
mul.f32 f487, f1327, 0f3F6C835E;
fma.rn.f32 f488, f455, 0fBEC3EF15, f487;
mul.f32 f490, f1326, 0fBF0E39DA;
mul.f32 f1319, f459, 0f3F54DB31;
sub.f32 f491, f1319, f490;
mul.f32 f492, f1326, 0f3F54DB31;
fma.rn.f32 f493, f459, 0fBF0E39DA, f492;
mul.f32 f495, f1325, 0fBF3504F3;
mul.f32 f1318, f463, 0f3F3504F3;
sub.f32 f496, f1318, f495;
mul.f32 f497, f1325, 0f3F3504F3;
fma.rn.f32 f498, f463, 0fBF3504F3, f497;
mul.f32 f1316, f467, 0f3F0E39DA;
mul.f32 f1317, f1324, 0fBF54DB31;
sub.f32 f501, f1316, f1317;
mul.f32 f502, f1324, 0f3F0E39DA;
fma.rn.f32 f503, f467, 0fBF54DB31, f502;
mul.f32 f1314, f471, 0f3EC3EF15;
mul.f32 f1315, f1323, 0fBF6C835E;
sub.f32 f506, f1314, f1315;
mul.f32 f507, f1323, 0f3EC3EF15;
fma.rn.f32 f508, f471, 0fBF6C835E, f507;
mul.f32 f1312, f475, 0f3E47C5C2;
mul.f32 f1313, f1322, 0fBF7B14BE;
sub.f32 f511, f1312, f1313;
mul.f32 f512, f1322, 0f3E47C5C2;
fma.rn.f32 f513, f475, 0fBF7B14BE, f512;
mul.f32 f1310, f453, 0fBE47C5C2;
mul.f32 f1311, f454, 0fBF7B14BE;
sub.f32 f516, f1310, f1311;
mul.f32 f517, f454, 0fBE47C5C2;
fma.rn.f32 f518, f453, 0fBF7B14BE, f517;
mul.f32 f520, f458, 0fBF6C835E;
mul.f32 f1309, f457, 0fBEC3EF15;
sub.f32 f521, f1309, f520;
mul.f32 f522, f458, 0fBEC3EF15;
fma.rn.f32 f523, f457, 0fBF6C835E, f522;
mul.f32 f525, f462, 0fBF54DB31;
mul.f32 f1308, f461, 0fBF0E39DA;
sub.f32 f526, f1308, f525;
mul.f32 f527, f462, 0fBF0E39DA;
fma.rn.f32 f528, f461, 0fBF54DB31, f527;
mul.f32 f529, f465, 0fBF3504F3;
mul.f32 f530, f466, 0fBF3504F3;
sub.f32 f531, f529, f530;
add.f32 f532, f529, f530;
mul.f32 f1306, f469, 0fBF54DB31;
mul.f32 f1307, f470, 0fBF0E39DA;
sub.f32 f535, f1306, f1307;
mul.f32 f536, f470, 0fBF54DB31;
fma.rn.f32 f537, f469, 0fBF0E39DA, f536;
mul.f32 f539, f474, 0fBEC3EF15;
mul.f32 f1305, f473, 0fBF6C835E;
sub.f32 f540, f1305, f539;
mul.f32 f541, f474, 0fBF6C835E;
fma.rn.f32 f542, f473, 0fBEC3EF15, f541;
mul.f32 f544, f478, 0fBE47C5C2;
mul.f32 f1304, f477, 0fBF7B14BE;
sub.f32 f545, f1304, f544;
mul.f32 f546, f478, 0fBF7B14BE;
fma.rn.f32 f547, f477, 0fBE47C5C2, f546;
add.f32 f550, f276, f481;
sub.f32 f552, f276, f481;
add.f32 f1303, f1381, f483;
sub.f32 f553, f1381, f483;
add.f32 f554, f280, f486;
sub.f32 f556, f280, f486;
add.f32 f1302, f1380, f488;
sub.f32 f557, f1380, f488;
add.f32 f558, f284, f491;
sub.f32 f560, f284, f491;
add.f32 f1301, f1379, f493;
sub.f32 f561, f1379, f493;
add.f32 f562, f288, f496;
sub.f32 f564, f288, f496;
add.f32 f1300, f1378, f498;
sub.f32 f565, f1378, f498;
add.f32 f566, f292, f501;
sub.f32 f568, f292, f501;
add.f32 f1299, f1377, f503;
sub.f32 f569, f1377, f503;
add.f32 f570, f296, f506;
sub.f32 f572, f296, f506;
add.f32 f1298, f1376, f508;
sub.f32 f573, f1376, f508;
add.f32 f574, f300, f511;
sub.f32 f576, f300, f511;
add.f32 f1297, f1375, f513;
sub.f32 f577, f1375, f513;
add.f32 f578, f274, f450;
sub.f32 f580, f274, f450;
sub.f32 f1296, f275, f449;
add.f32 f581, f275, f449;
add.f32 f582, f278, f516;
sub.f32 f584, f278, f516;
add.f32 f1295, f279, f518;
sub.f32 f585, f279, f518;
add.f32 f586, f282, f521;
sub.f32 f588, f282, f521;
add.f32 f1294, f283, f523;
sub.f32 f589, f283, f523;
add.f32 f590, f286, f526;
sub.f32 f592, f286, f526;
add.f32 f1293, f287, f528;
sub.f32 f593, f287, f528;
add.f32 f594, f290, f531;
sub.f32 f596, f290, f531;
add.f32 f1292, f291, f532;
sub.f32 f597, f291, f532;
add.f32 f598, f294, f535;
sub.f32 f600, f294, f535;
add.f32 f1291, f295, f537;
sub.f32 f601, f295, f537;
add.f32 f602, f298, f540;
sub.f32 f604, f298, f540;
add.f32 f1290, f299, f542;
sub.f32 f605, f299, f542;
add.f32 f606, f302, f545;
sub.f32 f608, f302, f545;
add.f32 f1289, f303, f547;
sub.f32 f609, f303, f547;
mov.u32 r15, %tid.x;
shl.b32 r7, r15, 8;
and.b32 r8, r7, -1024;
add.s32 r9, r4, r8;
and.b32 r14, r15, 3;
shl.b32 r10, r15, 3;
cvt.u64.u32 rd2, r10;
and.b64 rd3, rd2, 24;
mov.u64 rd4, %65;
add.s64 rd5, rd4, rd3;
ld.global.v2.f32 {f610, f611}, [rd5];
mul.f32 f615, f611, f1303;
mul.f32 f616, f610, f1303;
mul.f32 f618, f611, f611;
mul.f32 f1288, f610, f610;
sub.f32 f619, f1288, f618;
mul.f32 f620, f611, f610;
fma.rn.f32 f621, f611, f610, f620;
mul.f32 f623, f621, f1302;
mul.f32 f624, f619, f1302;
mul.f32 f626, f611, f621;
mul.f32 f1287, f610, f619;
sub.f32 f627, f1287, f626;
mul.f32 f1286, f619, f554;
mul.f32 f628, f610, f621;
fma.rn.f32 f629, f611, f619, f628;
mul.f32 f631, f629, f1301;
mul.f32 f632, f627, f1301;
mul.f32 f1284, f610, f627;
mul.f32 f1285, f611, f629;
sub.f32 f635, f1284, f1285;
mul.f32 f1283, f627, f558;
mul.f32 f636, f610, f629;
fma.rn.f32 f637, f611, f627, f636;
mul.f32 f639, f637, f1300;
mul.f32 f640, f635, f1300;
mul.f32 f642, f611, f637;
mul.f32 f1282, f610, f635;
sub.f32 f643, f1282, f642;
mul.f32 f1281, f635, f562;
mul.f32 f644, f610, f637;
fma.rn.f32 f645, f611, f635, f644;
mul.f32 f647, f645, f1299;
mul.f32 f648, f643, f1299;
mul.f32 f1279, f610, f643;
mul.f32 f1280, f611, f645;
sub.f32 f651, f1279, f1280;
mul.f32 f1278, f643, f566;
mul.f32 f652, f610, f645;
fma.rn.f32 f653, f611, f643, f652;
mul.f32 f655, f653, f1298;
mul.f32 f656, f651, f1298;
mul.f32 f658, f611, f653;
mul.f32 f1277, f610, f651;
sub.f32 f659, f1277, f658;
mul.f32 f1276, f651, f570;
mul.f32 f660, f610, f653;
fma.rn.f32 f661, f611, f651, f660;
mul.f32 f663, f661, f1297;
mul.f32 f664, f659, f1297;
mul.f32 f666, f611, f661;
mul.f32 f1275, f610, f659;
sub.f32 f667, f1275, f666;
mul.f32 f1274, f659, f574;
mul.f32 f668, f610, f661;
fma.rn.f32 f669, f611, f659, f668;
mul.f32 f671, f669, f1296;
mul.f32 f672, f667, f1296;
mul.f32 f1272, f610, f667;
mul.f32 f1273, f611, f669;
sub.f32 f675, f1272, f1273;
mul.f32 f1271, f667, f578;
mul.f32 f676, f610, f669;
fma.rn.f32 f677, f611, f667, f676;
mul.f32 f679, f677, f1295;
mul.f32 f680, f675, f1295;
mul.f32 f682, f611, f677;
mul.f32 f1270, f610, f675;
sub.f32 f683, f1270, f682;
mul.f32 f1269, f675, f582;
mul.f32 f684, f610, f677;
fma.rn.f32 f685, f611, f675, f684;
mul.f32 f687, f685, f1294;
mul.f32 f688, f683, f1294;
mul.f32 f690, f611, f685;
mul.f32 f1268, f610, f683;
sub.f32 f691, f1268, f690;
mul.f32 f1267, f683, f586;
mul.f32 f692, f610, f685;
fma.rn.f32 f693, f611, f683, f692;
mul.f32 f695, f693, f1293;
mul.f32 f696, f691, f1293;
mul.f32 f1265, f610, f691;
mul.f32 f1266, f611, f693;
sub.f32 f699, f1265, f1266;
mul.f32 f1264, f691, f590;
mul.f32 f700, f610, f693;
fma.rn.f32 f701, f611, f691, f700;
mul.f32 f703, f701, f1292;
mul.f32 f704, f699, f1292;
mul.f32 f706, f611, f701;
mul.f32 f1263, f610, f699;
sub.f32 f707, f1263, f706;
mul.f32 f1262, f699, f594;
mul.f32 f708, f610, f701;
fma.rn.f32 f709, f611, f699, f708;
mul.f32 f711, f709, f1291;
mul.f32 f712, f707, f1291;
mul.f32 f1260, f610, f707;
mul.f32 f1261, f611, f709;
sub.f32 f715, f1260, f1261;
mul.f32 f1259, f707, f598;
mul.f32 f716, f610, f709;
fma.rn.f32 f717, f611, f707, f716;
mul.f32 f719, f717, f1290;
mul.f32 f720, f715, f1290;
mul.f32 f722, f611, f717;
mul.f32 f1258, f610, f715;
sub.f32 f723, f1258, f722;
mul.f32 f1257, f715, f602;
mul.f32 f724, f610, f717;
fma.rn.f32 f725, f611, f715, f724;
mul.f32 f727, f725, f1289;
mul.f32 f728, f723, f1289;
mul.f32 f730, f611, f725;
mul.f32 f1256, f610, f723;
sub.f32 f731, f1256, f730;
mul.f32 f1255, f723, f606;
mul.f32 f732, f610, f725;
fma.rn.f32 f733, f611, f723, f732;
sub.f32 f1254, f1382, f1329;
mul.f32 f735, f733, f1254;
mul.f32 f736, f731, f1254;
mul.f32 f1252, f610, f731;
mul.f32 f1253, f611, f733;
sub.f32 f739, f1252, f1253;
sub.f32 f1251, f272, f447;
mul.f32 f1250, f731, f1251;
mul.f32 f740, f610, f733;
fma.rn.f32 f741, f611, f731, f740;
mul.f32 f743, f741, f553;
mul.f32 f744, f739, f553;
mul.f32 f746, f611, f741;
mul.f32 f1249, f610, f739;
sub.f32 f747, f1249, f746;
mul.f32 f1248, f739, f552;
mul.f32 f748, f610, f741;
fma.rn.f32 f749, f611, f739, f748;
mul.f32 f751, f749, f557;
mul.f32 f752, f747, f557;
mul.f32 f754, f611, f749;
mul.f32 f1247, f610, f747;
sub.f32 f755, f1247, f754;
mul.f32 f1246, f747, f556;
mul.f32 f756, f610, f749;
fma.rn.f32 f757, f611, f747, f756;
mul.f32 f759, f757, f561;
mul.f32 f760, f755, f561;
mul.f32 f1244, f610, f755;
mul.f32 f1245, f611, f757;
sub.f32 f763, f1244, f1245;
mul.f32 f1243, f755, f560;
mul.f32 f764, f610, f757;
fma.rn.f32 f765, f611, f755, f764;
mul.f32 f767, f765, f565;
mul.f32 f768, f763, f565;
mul.f32 f770, f611, f765;
mul.f32 f1242, f610, f763;
sub.f32 f771, f1242, f770;
mul.f32 f1241, f763, f564;
mul.f32 f772, f610, f765;
fma.rn.f32 f773, f611, f763, f772;
mul.f32 f775, f773, f569;
mul.f32 f776, f771, f569;
mul.f32 f1239, f610, f771;
mul.f32 f1240, f611, f773;
sub.f32 f779, f1239, f1240;
mul.f32 f1238, f771, f568;
mul.f32 f780, f610, f773;
fma.rn.f32 f781, f611, f771, f780;
mul.f32 f783, f781, f573;
mul.f32 f784, f779, f573;
mul.f32 f786, f611, f781;
mul.f32 f1237, f610, f779;
sub.f32 f787, f1237, f786;
mul.f32 f1236, f779, f572;
mul.f32 f788, f610, f781;
fma.rn.f32 f789, f611, f779, f788;
mul.f32 f791, f789, f577;
mul.f32 f792, f787, f577;
mul.f32 f794, f611, f789;
mul.f32 f1235, f610, f787;
sub.f32 f795, f1235, f794;
mul.f32 f1234, f787, f576;
mul.f32 f796, f610, f789;
fma.rn.f32 f797, f611, f787, f796;
mul.f32 f799, f797, f581;
mul.f32 f800, f795, f581;
mul.f32 f1232, f610, f795;
mul.f32 f1233, f611, f797;
sub.f32 f803, f1232, f1233;
mul.f32 f1231, f795, f580;
mul.f32 f804, f610, f797;
fma.rn.f32 f805, f611, f795, f804;
mul.f32 f807, f805, f585;
mul.f32 f808, f803, f585;
mul.f32 f810, f611, f805;
mul.f32 f1230, f610, f803;
sub.f32 f811, f1230, f810;
mul.f32 f1229, f803, f584;
mul.f32 f812, f610, f805;
fma.rn.f32 f813, f611, f803, f812;
mul.f32 f815, f813, f589;
mul.f32 f816, f811, f589;
mul.f32 f818, f611, f813;
mul.f32 f1228, f610, f811;
sub.f32 f819, f1228, f818;
mul.f32 f1227, f811, f588;
mul.f32 f820, f610, f813;
fma.rn.f32 f821, f611, f811, f820;
mul.f32 f823, f821, f593;
mul.f32 f824, f819, f593;
mul.f32 f1225, f610, f819;
mul.f32 f1226, f611, f821;
sub.f32 f827, f1225, f1226;
mul.f32 f1224, f819, f592;
mul.f32 f828, f610, f821;
fma.rn.f32 f829, f611, f819, f828;
mul.f32 f831, f829, f597;
mul.f32 f832, f827, f597;
mul.f32 f834, f611, f829;
mul.f32 f1223, f610, f827;
sub.f32 f835, f1223, f834;
mul.f32 f1222, f827, f596;
mul.f32 f836, f610, f829;
fma.rn.f32 f837, f611, f827, f836;
mul.f32 f839, f837, f601;
mul.f32 f840, f835, f601;
mul.f32 f1220, f610, f835;
mul.f32 f1221, f611, f837;
sub.f32 f843, f1220, f1221;
mul.f32 f1219, f835, f600;
mul.f32 f844, f610, f837;
fma.rn.f32 f845, f611, f835, f844;
mul.f32 f847, f845, f605;
mul.f32 f848, f843, f605;
mul.f32 f850, f611, f845;
mul.f32 f1218, f610, f843;
sub.f32 f851, f1218, f850;
mul.f32 f1217, f610, f550;
mul.f32 f852, f610, f845;
mul.f32 f1216, f843, f604;
fma.rn.f32 f853, f611, f843, f852;
mul.f32 f854, f851, f608;
mul.f32 f855, f853, f609;
mul.f32 f856, f851, f609;
mov.u32 r17, %tid.x;
shl.b32 r16, r17, 8;
barrier.sync 0;
and.b32 r11, r16, 768;
add.s32 r12, r9, r11;
sub.f32 f1431, f1382, f1329;
mul.f32 f1430, f733, f1431;
add.f32 f857, f1382, f1329;
sub.f32 f1429, f272, f447;
add.f32 f858, f272, f447;
mov.u32 r19, %tid.x;
and.b32 r18, r19, 3;
mov.u32 r21, %tid.x;
and.b32 r20, r21, 3;
fma.rn.f32 f859, f611, f550, f616;
sub.f32 f860, f1217, f615;
st.shared.v4.f32 [r12], {f858, f857, f860, f859};
fma.rn.f32 f861, f621, f554, f624;
sub.f32 f862, f1286, f623;
fma.rn.f32 f863, f629, f558, f632;
sub.f32 f864, f1283, f631;
st.shared.v4.f32 [r12+16], {f862, f861, f864, f863};
fma.rn.f32 f865, f637, f562, f640;
sub.f32 f866, f1281, f639;
sub.f32 f867, f1278, f647;
fma.rn.f32 f868, f645, f566, f648;
st.shared.v4.f32 [r12+32], {f866, f865, f867, f868};
fma.rn.f32 f869, f653, f570, f656;
sub.f32 f870, f1276, f655;
fma.rn.f32 f871, f661, f574, f664;
sub.f32 f872, f1274, f663;
st.shared.v4.f32 [r12+48], {f870, f869, f872, f871};
fma.rn.f32 f873, f669, f578, f672;
sub.f32 f874, f1271, f671;
fma.rn.f32 f875, f677, f582, f680;
sub.f32 f876, f1269, f679;
st.shared.v4.f32 [r12+64], {f874, f873, f876, f875};
fma.rn.f32 f877, f685, f586, f688;
sub.f32 f878, f1267, f687;
fma.rn.f32 f879, f693, f590, f696;
sub.f32 f880, f1264, f695;
st.shared.v4.f32 [r12+80], {f878, f877, f880, f879};
fma.rn.f32 f881, f701, f594, f704;
sub.f32 f882, f1262, f703;
fma.rn.f32 f883, f709, f598, f712;
sub.f32 f884, f1259, f711;
st.shared.v4.f32 [r12+96], {f882, f881, f884, f883};
fma.rn.f32 f885, f717, f602, f720;
sub.f32 f886, f1257, f719;
fma.rn.f32 f887, f725, f606, f728;
sub.f32 f888, f1255, f727;
st.shared.v4.f32 [r12+112], {f886, f885, f888, f887};
fma.rn.f32 f889, f733, f1429, f736;
sub.f32 f890, f1250, f1430;
fma.rn.f32 f891, f741, f552, f744;
sub.f32 f892, f1248, f743;
st.shared.v4.f32 [r12+128], {f890, f889, f892, f891};
fma.rn.f32 f893, f749, f556, f752;
sub.f32 f894, f1246, f751;
fma.rn.f32 f895, f757, f560, f760;
sub.f32 f896, f1243, f759;
st.shared.v4.f32 [r12+144], {f894, f893, f896, f895};
fma.rn.f32 f897, f765, f564, f768;
sub.f32 f898, f1241, f767;
fma.rn.f32 f899, f773, f568, f776;
sub.f32 f900, f1238, f775;
st.shared.v4.f32 [r12+160], {f898, f897, f900, f899};
fma.rn.f32 f901, f781, f572, f784;
sub.f32 f902, f1236, f783;
fma.rn.f32 f903, f789, f576, f792;
sub.f32 f904, f1234, f791;
st.shared.v4.f32 [r12+176], {f902, f901, f904, f903};
fma.rn.f32 f905, f797, f580, f800;
sub.f32 f906, f1231, f799;
fma.rn.f32 f907, f805, f584, f808;
sub.f32 f908, f1229, f807;
st.shared.v4.f32 [r12+192], {f906, f905, f908, f907};
fma.rn.f32 f909, f813, f588, f816;
sub.f32 f910, f1227, f815;
fma.rn.f32 f911, f821, f592, f824;
sub.f32 f912, f1224, f823;
st.shared.v4.f32 [r12+208], {f910, f909, f912, f911};
fma.rn.f32 f913, f829, f596, f832;
sub.f32 f914, f1222, f831;
fma.rn.f32 f915, f837, f600, f840;
sub.f32 f916, f1219, f839;
st.shared.v4.f32 [r12+224], {f914, f913, f916, f915};
fma.rn.f32 f917, f845, f604, f848;
sub.f32 f918, f1216, f847;
fma.rn.f32 f919, f853, f608, f856;
sub.f32 f920, f854, f855;
st.shared.v4.f32 [r12+240], {f918, f917, f920, f919};
barrier.sync 0;
mad.lo.s32 r13, r20, -248, r12;
ld.shared.v2.f32 {f921, f922}, [r13];
ld.shared.v2.f32 {f925, f926}, [r13+32];
ld.shared.v2.f32 {f929, f930}, [r13+64];
ld.shared.v2.f32 {f933, f934}, [r13+96];
ld.shared.v2.f32 {f937, f938}, [r13+128];
ld.shared.v2.f32 {f941, f942}, [r13+160];
ld.shared.v2.f32 {f945, f946}, [r13+192];
ld.shared.v2.f32 {f949, f950}, [r13+224];
ld.shared.v2.f32 {f953, f954}, [r13+256];
ld.shared.v2.f32 {f957, f958}, [r13+288];
ld.shared.v2.f32 {f961, f962}, [r13+320];
ld.shared.v2.f32 {f965, f966}, [r13+352];
ld.shared.v2.f32 {f969, f970}, [r13+384];
ld.shared.v2.f32 {f973, f974}, [r13+416];
ld.shared.v2.f32 {f977, f978}, [r13+448];
ld.shared.v2.f32 {f981, f982}, [r13+480];
ld.shared.v2.f32 {f985, f986}, [r13+512];
ld.shared.v2.f32 {f989, f990}, [r13+544];
ld.shared.v2.f32 {f993, f994}, [r13+576];
ld.shared.v2.f32 {f997, f998}, [r13+608];
ld.shared.v2.f32 {f1001, f1002}, [r13+640];
ld.shared.v2.f32 {f1005, f1006}, [r13+672];
ld.shared.v2.f32 {f1009, f1010}, [r13+704];
ld.shared.v2.f32 {f1013, f1014}, [r13+736];
ld.shared.v2.f32 {f1017, f1018}, [r13+768];
ld.shared.v2.f32 {f1021, f1022}, [r13+800];
ld.shared.v2.f32 {f1025, f1026}, [r13+832];
ld.shared.v2.f32 {f1029, f1030}, [r13+864];
ld.shared.v2.f32 {f1033, f1034}, [r13+896];
ld.shared.v2.f32 {f1037, f1038}, [r13+928];
ld.shared.v2.f32 {f1041, f1042}, [r13+960];
ld.shared.v2.f32 {f1045, f1046}, [r13+992];
add.f32 f1049, f921, f985;
sub.f32 f1051, f921, f985;
add.f32 f1215, f922, f986;
sub.f32 f1052, f922, f986;
add.f32 f1053, f953, f1017;
sub.f32 f1055, f953, f1017;
add.f32 f1214, f954, f1018;
sub.f32 f1056, f954, f1018;
add.f32 f1057, f925, f989;
sub.f32 f1059, f925, f989;
add.f32 f1213, f926, f990;
sub.f32 f1060, f926, f990;
add.f32 f1061, f957, f1021;
sub.f32 f1063, f957, f1021;
add.f32 f1212, f958, f1022;
sub.f32 f1064, f958, f1022;
add.f32 f1065, f929, f993;
sub.f32 f1067, f929, f993;
add.f32 f1211, f930, f994;
sub.f32 f1068, f930, f994;
add.f32 f1069, f961, f1025;
sub.f32 f1071, f961, f1025;
add.f32 f1210, f962, f1026;
sub.f32 f1072, f962, f1026;
add.f32 f1073, f933, f997;
sub.f32 f1075, f933, f997;
add.f32 f1209, f934, f998;
sub.f32 f1076, f934, f998;
add.f32 f1077, f965, f1029;
sub.f32 f1079, f965, f1029;
add.f32 f1208, f966, f1030;
sub.f32 f1080, f966, f1030;
add.f32 f1081, f937, f1001;
sub.f32 f1083, f937, f1001;
add.f32 f1207, f938, f1002;
sub.f32 f1084, f938, f1002;
add.f32 f1085, f969, f1033;
sub.f32 f1087, f969, f1033;
add.f32 f1206, f970, f1034;
sub.f32 f1088, f970, f1034;
add.f32 f1089, f941, f1005;
sub.f32 f1091, f941, f1005;
add.f32 f1205, f942, f1006;
sub.f32 f1092, f942, f1006;
add.f32 f1093, f973, f1037;
sub.f32 f1095, f973, f1037;
add.f32 f1204, f974, f1038;
sub.f32 f1096, f974, f1038;
add.f32 f1097, f945, f1009;
sub.f32 f1099, f945, f1009;
add.f32 f1203, f946, f1010;
sub.f32 f1100, f946, f1010;
add.f32 f1101, f977, f1041;
sub.f32 f1103, f977, f1041;
add.f32 f1202, f978, f1042;
sub.f32 f1104, f978, f1042;
add.f32 f1105, f949, f1013;
sub.f32 f1107, f949, f1013;
add.f32 f1201, f950, f1014;
sub.f32 f1108, f950, f1014;
add.f32 f1109, f981, f1045;
sub.f32 f1111, f981, f1045;
add.f32 f1200, f982, f1046;
sub.f32 f1112, f982, f1046;
add.f32 %1, f1215, f1214;
add.f32 %0, f1049, f1053;
add.f32 %2, f1057, f1061;
add.f32 %3, f1213, f1212;
add.f32 %4, f1065, f1069;
add.f32 %5, f1211, f1210;
add.f32 %6, f1073, f1077;
add.f32 %7, f1209, f1208;
add.f32 %8, f1081, f1085;
add.f32 %9, f1207, f1206;
add.f32 %11, f1205, f1204;
add.f32 %10, f1089, f1093;
add.f32 %13, f1203, f1202;
add.f32 %12, f1097, f1101;
add.f32 %15, f1201, f1200;
add.f32 %14, f1105, f1109;
add.f32 %16, f1051, f1056;
sub.f32 %17, f1052, f1055;
add.f32 %18, f1059, f1064;
sub.f32 %19, f1060, f1063;
add.f32 %20, f1067, f1072;
sub.f32 %21, f1068, f1071;
sub.f32 %23, f1076, f1079;
add.f32 %22, f1075, f1080;
sub.f32 %25, f1084, f1087;
add.f32 %24, f1083, f1088;
sub.f32 %27, f1092, f1095;
add.f32 %26, f1091, f1096;
add.f32 %28, f1099, f1104;
sub.f32 %29, f1100, f1103;
add.f32 %30, f1107, f1112;
sub.f32 %31, f1108, f1111;
sub.f32 %33, f1215, f1214;
sub.f32 %32, f1049, f1053;
sub.f32 %35, f1213, f1212;
sub.f32 %34, f1057, f1061;
sub.f32 %37, f1211, f1210;
sub.f32 %36, f1065, f1069;
sub.f32 %39, f1209, f1208;
sub.f32 %38, f1073, f1077;
sub.f32 %41, f1207, f1206;
sub.f32 %40, f1081, f1085;
sub.f32 %43, f1205, f1204;
sub.f32 %42, f1089, f1093;
sub.f32 %45, f1203, f1202;
sub.f32 %44, f1097, f1101;
sub.f32 %47, f1201, f1200;
sub.f32 %46, f1105, f1109;
add.f32 %49, f1052, f1055;
sub.f32 %48, f1051, f1056;
add.f32 %51, f1060, f1063;
sub.f32 %50, f1059, f1064;
add.f32 %53, f1068, f1071;
sub.f32 %52, f1067, f1072;
add.f32 %55, f1076, f1079;
sub.f32 %54, f1075, f1080;
add.f32 %57, f1084, f1087;
sub.f32 %56, f1083, f1088;
add.f32 %59, f1092, f1095;
sub.f32 %58, f1091, f1096;
add.f32 %61, f1100, f1103;
sub.f32 %60, f1099, f1104;
add.f32 %63, f1108, f1111;
sub.f32 %62, f1107, f1112;
})"
     // Operand numbering used by the PTX above:
     //   %0  .. %63  : outputs, rmem[k].x = %(2k), rmem[k].y = %(2k+1), k = 0..31
     //   %64         : smem (shared-memory base address)
     //   %65         : lut_sp_32_128 (global-memory table address)
     //   %66 .. %129 : inputs, rmem[k].x = %(66+2k), rmem[k].y = %(67+2k), k = 0..31
     //   %130.. %149 : second copies of selected .y inputs, in the order
     //                 k = 16, 8, 20, 4, 28, 2, 26, 10, 22, 14, 17, 1, 25, 5, 29, 13, 19, 11, 23, 7
     // All input operands are consumed before the first output operand is written.
     : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y), "=f"(rmem[8].x), "=f"(rmem[8].y), "=f"(rmem[9].x), "=f"(rmem[9].y), "=f"(rmem[10].x), "=f"(rmem[10].y), "=f"(rmem[11].x), "=f"(rmem[11].y), "=f"(rmem[12].x), "=f"(rmem[12].y), "=f"(rmem[13].x), "=f"(rmem[13].y), "=f"(rmem[14].x), "=f"(rmem[14].y), "=f"(rmem[15].x), "=f"(rmem[15].y), "=f"(rmem[16].x), "=f"(rmem[16].y), "=f"(rmem[17].x), "=f"(rmem[17].y), "=f"(rmem[18].x), "=f"(rmem[18].y), "=f"(rmem[19].x), "=f"(rmem[19].y), "=f"(rmem[20].x), "=f"(rmem[20].y), "=f"(rmem[21].x), "=f"(rmem[21].y), "=f"(rmem[22].x), "=f"(rmem[22].y), "=f"(rmem[23].x), "=f"(rmem[23].y), "=f"(rmem[24].x), "=f"(rmem[24].y), "=f"(rmem[25].x), "=f"(rmem[25].y), "=f"(rmem[26].x), "=f"(rmem[26].y), "=f"(rmem[27].x), "=f"(rmem[27].y), "=f"(rmem[28].x), "=f"(rmem[28].y), "=f"(rmem[29].x), "=f"(rmem[29].y), "=f"(rmem[30].x), "=f"(rmem[30].y), "=f"(rmem[31].x), "=f"(rmem[31].y): "r"(smem), "l"(lut_sp_32_128), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y), "f"(rmem[8].x), "f"(rmem[8].y), "f"(rmem[9].x), "f"(rmem[9].y), "f"(rmem[10].x), "f"(rmem[10].y), "f"(rmem[11].x), "f"(rmem[11].y), "f"(rmem[12].x), "f"(rmem[12].y), "f"(rmem[13].x), "f"(rmem[13].y), "f"(rmem[14].x), "f"(rmem[14].y), "f"(rmem[15].x), "f"(rmem[15].y), "f"(rmem[16].x), "f"(rmem[16].y), "f"(rmem[17].x), "f"(rmem[17].y), "f"(rmem[18].x), "f"(rmem[18].y), "f"(rmem[19].x), "f"(rmem[19].y), "f"(rmem[20].x), "f"(rmem[20].y), "f"(rmem[21].x), "f"(rmem[21].y), "f"(rmem[22].x), "f"(rmem[22].y), "f"(rmem[23].x), "f"(rmem[23].y), "f"(rmem[24].x), "f"(rmem[24].y), 
"f"(rmem[25].x), "f"(rmem[25].y), "f"(rmem[26].x), "f"(rmem[26].y), "f"(rmem[27].x), "f"(rmem[27].y), "f"(rmem[28].x), "f"(rmem[28].y), "f"(rmem[29].x), "f"(rmem[29].y), "f"(rmem[30].x), "f"(rmem[30].y), "f"(rmem[31].x), "f"(rmem[31].y), "f"(rmem[16].y), "f"(rmem[8].y), "f"(rmem[20].y), "f"(rmem[4].y), "f"(rmem[28].y), "f"(rmem[2].y), "f"(rmem[26].y), "f"(rmem[10].y), "f"(rmem[22].y), "f"(rmem[14].y), "f"(rmem[17].y), "f"(rmem[1].y), "f"(rmem[25].y), "f"(rmem[5].y), "f"(rmem[29].y), "f"(rmem[13].y), "f"(rmem[19].y), "f"(rmem[11].y), "f"(rmem[23].y), "f"(rmem[7].y));
};




// cufftdx_private_function<58, float, 1>
//
// One device-side stage implemented as a single inline-PTX `asm volatile`
// statement. It consumes the 32 complex<float> values held per thread in
// rmem[0..31], uses shared memory starting at `smem` as scratch, and writes
// 32 complex<float> results back into rmem[0..31].
// (The include guard of this header suggests it is part of a 128-point FP32
// forward FFT -- presumably four threads in x cooperate on one transform;
// TODO confirm against the caller.)
//
// Parameters:
//   rmem - per-thread array; elements 0..31 (.x and .y) are read as inputs and
//          every one of them is overwritten with an output.
//   smem - 32-bit shared-memory address used as the base of all
//          st.shared / ld.shared accesses in this function.
//
// Operand map of the asm statement:
//   %0   .. %63    outputs  rmem[0..31].{x,y}
//   %64            smem
//   %65            lut_sp_32_128 (declared elsewhere; used as a 64-bit address
//                  and read with ld.global.v2.f32)
//   %66  .. %129   inputs   rmem[0..31].{x,y}  (%66+2k = rmem[k].x,
//                                               %67+2k = rmem[k].y)
//   %130 .. %149   additional read-only copies of selected rmem[k].y values
//
// Phases, in program order:
//   1. An add/sub/mul/fma network over the 32 register inputs using hard-coded
//      float constants (0f3F3504F3 ~ 0.70710677, 0f3F6C835E ~ 0.9238795,
//      0f3EC3EF15 ~ 0.38268343, 0f3F7B14BE ~ 0.9807853, 0f3E47C5C2 ~ 0.19509032,
//      0f3F54DB31 ~ 0.8314696, 0f3F0E39DA ~ 0.55557024, and their negations).
//   2. One float2 `w` is loaded from the table at byte offset (tid.x * 8) & 24.
//      The 2nd..32nd intermediate values are multiplied by w, w^2, ..., w^31;
//      w^2 is formed by squaring w and each later power by one more complex
//      multiply with w. The first intermediate value is passed through as is.
//   3. Exchange through shared memory, done once for the real parts and once
//      for the imaginary parts: each thread stores 32 floats as 128 contiguous
//      bytes at smem + tid.y * 512 + tid.x * 128, and after a barrier loads
//      32 floats with a 16-byte stride starting 124 * (tid.x & 3) bytes below
//      its own store address.
//   4. Eight 4-point butterflies combine loaded elements k, k+8, k+16, k+24
//      (k = 0..7) and write rmem[k], rmem[k+8], rmem[k+16], rmem[k+24].
//
// Notes:
//   - The PTX contains four `barrier.sync 0` instructions, so every thread
//     participating in barrier 0 is expected to reach this call together
//     (i.e. not from divergent control flow).
//   - NOTE(review): %tid.x is read twice (into r15 and r17); both copies hold
//     the same value.
//   - NOTE(review): the asm statement has no "memory" clobber although it
//     writes shared memory and reads global memory; ordering against
//     surrounding memory accesses appears to rely on `volatile` plus the
//     register operand dependencies -- confirm this is intended.
template<> __forceinline__ __device__ void cufftdx_private_function<58, float, 1>(cufftdx::detail::complex<float> *rmem, unsigned smem){

asm volatile (R"({
.reg .f32 f<1381>;
.reg .b32 r<18>;
.reg .b64 rd<9>;
mov.u32 r1, %tid.y;
shl.b32 r2, r1, 9;
mov.u32 r3, %64;
add.s32 r4, r3, r2;
add.f32 f129, %66, %98;
sub.f32 f131, %66, %98;
add.f32 f1379, %67, %130;
sub.f32 f132, %67, %130;
add.f32 f133, %82, %114;
sub.f32 f135, %82, %114;
add.f32 f1377, %131, %115;
sub.f32 f136, %131, %115;
add.f32 f137, f129, f133;
sub.f32 f139, f129, f133;
add.f32 f1376, f1379, f1377;
sub.f32 f140, f1379, f1377;
add.f32 f141, f131, f136;
sub.f32 f143, f131, f136;
sub.f32 f1375, f132, f135;
add.f32 f144, f132, f135;
add.f32 f145, %74, %106;
sub.f32 f147, %74, %106;
add.f32 f1372, %132, %133;
sub.f32 f148, %132, %133;
add.f32 f149, %90, %122;
sub.f32 f151, %90, %122;
add.f32 f1370, %91, %134;
sub.f32 f152, %91, %134;
add.f32 f153, f145, f149;
sub.f32 f155, f145, f149;
add.f32 f1369, f1372, f1370;
sub.f32 f156, f1372, f1370;
add.f32 f157, f147, f152;
sub.f32 f159, f147, f152;
sub.f32 f1368, f148, f151;
add.f32 f160, f148, f151;
mul.f32 f162, f1368, 0fBF3504F3;
mul.f32 f1367, f157, 0f3F3504F3;
sub.f32 f163, f1367, f162;
mul.f32 f164, f1368, 0f3F3504F3;
fma.rn.f32 f165, f157, 0fBF3504F3, f164;
mul.f32 f166, f159, 0fBF3504F3;
mul.f32 f167, f160, 0fBF3504F3;
sub.f32 f168, f166, f167;
add.f32 f169, f166, f167;
add.f32 f170, f137, f153;
sub.f32 f172, f137, f153;
add.f32 f1366, f1376, f1369;
sub.f32 f173, f1376, f1369;
add.f32 f174, f141, f163;
sub.f32 f176, f141, f163;
add.f32 f1365, f1375, f165;
sub.f32 f177, f1375, f165;
add.f32 f178, f139, f156;
sub.f32 f180, f139, f156;
sub.f32 f1364, f140, f155;
add.f32 f181, f140, f155;
add.f32 f182, f143, f168;
sub.f32 f184, f143, f168;
add.f32 f1363, f144, f169;
sub.f32 f185, f144, f169;
add.f32 f186, %70, %102;
sub.f32 f188, %70, %102;
add.f32 f1361, %135, %103;
sub.f32 f189, %135, %103;
add.f32 f190, %86, %118;
sub.f32 f192, %86, %118;
add.f32 f1358, %137, %136;
sub.f32 f193, %137, %136;
add.f32 f194, f186, f190;
sub.f32 f196, f186, f190;
add.f32 f1357, f1361, f1358;
sub.f32 f197, f1361, f1358;
add.f32 f198, f188, f193;
sub.f32 f200, f188, f193;
sub.f32 f1356, f189, f192;
add.f32 f201, f189, f192;
add.f32 f202, %78, %110;
sub.f32 f204, %78, %110;
add.f32 f1354, %79, %138;
sub.f32 f205, %79, %138;
add.f32 f206, %94, %126;
sub.f32 f208, %94, %126;
add.f32 f1352, %139, %127;
sub.f32 f209, %139, %127;
add.f32 f210, f202, f206;
sub.f32 f212, f202, f206;
add.f32 f1351, f1354, f1352;
sub.f32 f213, f1354, f1352;
add.f32 f214, f204, f209;
sub.f32 f216, f204, f209;
sub.f32 f1350, f205, f208;
add.f32 f217, f205, f208;
mul.f32 f219, f1350, 0fBF3504F3;
mul.f32 f1349, f214, 0f3F3504F3;
sub.f32 f220, f1349, f219;
mul.f32 f221, f1350, 0f3F3504F3;
fma.rn.f32 f222, f214, 0fBF3504F3, f221;
mul.f32 f223, f216, 0fBF3504F3;
mul.f32 f224, f217, 0fBF3504F3;
sub.f32 f225, f223, f224;
add.f32 f226, f223, f224;
add.f32 f227, f194, f210;
sub.f32 f229, f194, f210;
add.f32 f1348, f1357, f1351;
sub.f32 f230, f1357, f1351;
add.f32 f231, f198, f220;
sub.f32 f233, f198, f220;
add.f32 f1347, f1356, f222;
sub.f32 f234, f1356, f222;
add.f32 f235, f196, f213;
sub.f32 f237, f196, f213;
sub.f32 f1346, f197, f212;
add.f32 f238, f197, f212;
add.f32 f239, f200, f225;
sub.f32 f241, f200, f225;
add.f32 f1345, f201, f226;
sub.f32 f242, f201, f226;
mul.f32 f1343, f231, 0f3F6C835E;
mul.f32 f1344, f1347, 0fBEC3EF15;
sub.f32 f245, f1343, f1344;
mul.f32 f246, f1347, 0f3F6C835E;
fma.rn.f32 f247, f231, 0fBEC3EF15, f246;
mul.f32 f1341, f235, 0f3F3504F3;
mul.f32 f1342, f1346, 0fBF3504F3;
sub.f32 f250, f1341, f1342;
mul.f32 f251, f1346, 0f3F3504F3;
fma.rn.f32 f252, f235, 0fBF3504F3, f251;
mul.f32 f1339, f239, 0f3EC3EF15;
mul.f32 f1340, f1345, 0fBF6C835E;
sub.f32 f255, f1339, f1340;
mul.f32 f256, f1345, 0f3EC3EF15;
fma.rn.f32 f257, f239, 0fBF6C835E, f256;
mul.f32 f1337, f233, 0fBEC3EF15;
mul.f32 f1338, f234, 0fBF6C835E;
sub.f32 f260, f1337, f1338;
mul.f32 f261, f234, 0fBEC3EF15;
fma.rn.f32 f262, f233, 0fBF6C835E, f261;
mul.f32 f263, f237, 0fBF3504F3;
mul.f32 f264, f238, 0fBF3504F3;
sub.f32 f265, f263, f264;
add.f32 f266, f263, f264;
mul.f32 f1335, f241, 0fBF6C835E;
mul.f32 f1336, f242, 0fBEC3EF15;
sub.f32 f269, f1335, f1336;
mul.f32 f270, f242, 0fBF6C835E;
fma.rn.f32 f271, f241, 0fBEC3EF15, f270;
add.f32 f272, f170, f227;
sub.f32 f274, f170, f227;
add.f32 f1334, f1366, f1348;
sub.f32 f275, f1366, f1348;
add.f32 f276, f174, f245;
sub.f32 f278, f174, f245;
add.f32 f1333, f1365, f247;
sub.f32 f279, f1365, f247;
add.f32 f280, f178, f250;
sub.f32 f282, f178, f250;
add.f32 f1332, f1364, f252;
sub.f32 f283, f1364, f252;
add.f32 f284, f182, f255;
sub.f32 f286, f182, f255;
add.f32 f1331, f1363, f257;
sub.f32 f287, f1363, f257;
add.f32 f288, f172, f230;
sub.f32 f290, f172, f230;
sub.f32 f1330, f173, f229;
add.f32 f291, f173, f229;
add.f32 f292, f176, f260;
sub.f32 f294, f176, f260;
add.f32 f1329, f177, f262;
sub.f32 f295, f177, f262;
add.f32 f296, f180, f265;
sub.f32 f298, f180, f265;
add.f32 f1328, f181, f266;
sub.f32 f299, f181, f266;
add.f32 f300, f184, f269;
sub.f32 f302, f184, f269;
add.f32 f1327, f185, f271;
sub.f32 f303, f185, f271;
add.f32 f304, %68, %100;
sub.f32 f306, %68, %100;
add.f32 f1324, %141, %140;
sub.f32 f307, %141, %140;
add.f32 f308, %84, %116;
sub.f32 f310, %84, %116;
add.f32 f1322, %85, %142;
sub.f32 f311, %85, %142;
add.f32 f312, f304, f308;
sub.f32 f314, f304, f308;
add.f32 f1321, f1324, f1322;
sub.f32 f315, f1324, f1322;
add.f32 f316, f306, f311;
sub.f32 f318, f306, f311;
sub.f32 f1320, f307, f310;
add.f32 f319, f307, f310;
add.f32 f320, %76, %108;
sub.f32 f322, %76, %108;
add.f32 f1318, %143, %109;
sub.f32 f323, %143, %109;
add.f32 f324, %92, %124;
sub.f32 f326, %92, %124;
add.f32 f1315, %145, %144;
sub.f32 f327, %145, %144;
add.f32 f328, f320, f324;
sub.f32 f330, f320, f324;
add.f32 f1314, f1318, f1315;
sub.f32 f331, f1318, f1315;
add.f32 f332, f322, f327;
sub.f32 f334, f322, f327;
sub.f32 f1313, f323, f326;
add.f32 f335, f323, f326;
mul.f32 f1311, f332, 0f3F3504F3;
mul.f32 f1312, f1313, 0fBF3504F3;
sub.f32 f338, f1311, f1312;
mul.f32 f339, f1313, 0f3F3504F3;
fma.rn.f32 f340, f332, 0fBF3504F3, f339;
mul.f32 f341, f334, 0fBF3504F3;
mul.f32 f342, f335, 0fBF3504F3;
sub.f32 f343, f341, f342;
add.f32 f344, f341, f342;
add.f32 f345, f312, f328;
sub.f32 f347, f312, f328;
add.f32 f1310, f1321, f1314;
sub.f32 f348, f1321, f1314;
add.f32 f349, f316, f338;
sub.f32 f351, f316, f338;
add.f32 f1309, f1320, f340;
sub.f32 f352, f1320, f340;
add.f32 f353, f314, f331;
sub.f32 f355, f314, f331;
sub.f32 f1308, f315, f330;
add.f32 f356, f315, f330;
add.f32 f357, f318, f343;
sub.f32 f359, f318, f343;
add.f32 f1307, f319, f344;
sub.f32 f360, f319, f344;
add.f32 f361, %72, %104;
sub.f32 f363, %72, %104;
add.f32 f1305, %73, %146;
sub.f32 f364, %73, %146;
add.f32 f365, %88, %120;
sub.f32 f367, %88, %120;
add.f32 f1303, %147, %121;
sub.f32 f368, %147, %121;
add.f32 f369, f361, f365;
sub.f32 f371, f361, f365;
add.f32 f1302, f1305, f1303;
sub.f32 f372, f1305, f1303;
add.f32 f373, f363, f368;
sub.f32 f375, f363, f368;
sub.f32 f1301, f364, f367;
add.f32 f376, f364, f367;
add.f32 f377, %80, %112;
sub.f32 f379, %80, %112;
add.f32 f1298, %148, %149;
sub.f32 f380, %148, %149;
add.f32 f381, %96, %128;
sub.f32 f383, %96, %128;
add.f32 f1297, %97, %129;
sub.f32 f384, %97, %129;
add.f32 f385, f377, f381;
sub.f32 f387, f377, f381;
add.f32 f1296, f1298, f1297;
sub.f32 f388, f1298, f1297;
add.f32 f389, f379, f384;
sub.f32 f391, f379, f384;
sub.f32 f1295, f380, f383;
add.f32 f392, f380, f383;
mul.f32 f1293, f389, 0f3F3504F3;
mul.f32 f1294, f1295, 0fBF3504F3;
sub.f32 f395, f1293, f1294;
mul.f32 f396, f1295, 0f3F3504F3;
fma.rn.f32 f397, f389, 0fBF3504F3, f396;
mul.f32 f398, f391, 0fBF3504F3;
mul.f32 f399, f392, 0fBF3504F3;
sub.f32 f400, f398, f399;
add.f32 f401, f398, f399;
add.f32 f402, f369, f385;
sub.f32 f404, f369, f385;
add.f32 f1292, f1302, f1296;
sub.f32 f405, f1302, f1296;
add.f32 f406, f373, f395;
sub.f32 f408, f373, f395;
add.f32 f1291, f1301, f397;
sub.f32 f409, f1301, f397;
add.f32 f410, f371, f388;
sub.f32 f412, f371, f388;
sub.f32 f1290, f372, f387;
add.f32 f413, f372, f387;
add.f32 f414, f375, f400;
sub.f32 f416, f375, f400;
add.f32 f1289, f376, f401;
sub.f32 f417, f376, f401;
mul.f32 f419, f1291, 0fBEC3EF15;
mul.f32 f1288, f406, 0f3F6C835E;
sub.f32 f420, f1288, f419;
mul.f32 f421, f1291, 0f3F6C835E;
fma.rn.f32 f422, f406, 0fBEC3EF15, f421;
mul.f32 f424, f1290, 0fBF3504F3;
mul.f32 f1287, f410, 0f3F3504F3;
sub.f32 f425, f1287, f424;
mul.f32 f426, f1290, 0f3F3504F3;
fma.rn.f32 f427, f410, 0fBF3504F3, f426;
mul.f32 f1285, f414, 0f3EC3EF15;
mul.f32 f1286, f1289, 0fBF6C835E;
sub.f32 f430, f1285, f1286;
mul.f32 f431, f1289, 0f3EC3EF15;
fma.rn.f32 f432, f414, 0fBF6C835E, f431;
mul.f32 f1283, f408, 0fBEC3EF15;
mul.f32 f1284, f409, 0fBF6C835E;
sub.f32 f435, f1283, f1284;
mul.f32 f436, f409, 0fBEC3EF15;
fma.rn.f32 f437, f408, 0fBF6C835E, f436;
mul.f32 f438, f412, 0fBF3504F3;
mul.f32 f439, f413, 0fBF3504F3;
sub.f32 f440, f438, f439;
add.f32 f441, f438, f439;
mul.f32 f443, f417, 0fBEC3EF15;
mul.f32 f1282, f416, 0fBF6C835E;
sub.f32 f444, f1282, f443;
mul.f32 f445, f417, 0fBF6C835E;
fma.rn.f32 f446, f416, 0fBEC3EF15, f445;
add.f32 f447, f345, f402;
sub.f32 f449, f345, f402;
add.f32 f1281, f1310, f1292;
sub.f32 f450, f1310, f1292;
add.f32 f451, f349, f420;
sub.f32 f453, f349, f420;
add.f32 f1280, f1309, f422;
sub.f32 f454, f1309, f422;
add.f32 f455, f353, f425;
sub.f32 f457, f353, f425;
add.f32 f1279, f1308, f427;
sub.f32 f458, f1308, f427;
add.f32 f459, f357, f430;
sub.f32 f461, f357, f430;
add.f32 f1278, f1307, f432;
sub.f32 f462, f1307, f432;
add.f32 f463, f347, f405;
sub.f32 f465, f347, f405;
sub.f32 f1277, f348, f404;
add.f32 f466, f348, f404;
add.f32 f467, f351, f435;
sub.f32 f469, f351, f435;
add.f32 f1276, f352, f437;
sub.f32 f470, f352, f437;
add.f32 f471, f355, f440;
sub.f32 f473, f355, f440;
add.f32 f1275, f356, f441;
sub.f32 f474, f356, f441;
add.f32 f475, f359, f444;
sub.f32 f477, f359, f444;
add.f32 f1274, f360, f446;
sub.f32 f478, f360, f446;
mul.f32 f480, f1280, 0fBE47C5C2;
mul.f32 f1273, f451, 0f3F7B14BE;
sub.f32 f481, f1273, f480;
mul.f32 f482, f1280, 0f3F7B14BE;
fma.rn.f32 f483, f451, 0fBE47C5C2, f482;
mul.f32 f485, f1279, 0fBEC3EF15;
mul.f32 f1272, f455, 0f3F6C835E;
sub.f32 f486, f1272, f485;
mul.f32 f487, f1279, 0f3F6C835E;
fma.rn.f32 f488, f455, 0fBEC3EF15, f487;
mul.f32 f490, f1278, 0fBF0E39DA;
mul.f32 f1271, f459, 0f3F54DB31;
sub.f32 f491, f1271, f490;
mul.f32 f492, f1278, 0f3F54DB31;
fma.rn.f32 f493, f459, 0fBF0E39DA, f492;
mul.f32 f495, f1277, 0fBF3504F3;
mul.f32 f1270, f463, 0f3F3504F3;
sub.f32 f496, f1270, f495;
mul.f32 f497, f1277, 0f3F3504F3;
fma.rn.f32 f498, f463, 0fBF3504F3, f497;
mul.f32 f1268, f467, 0f3F0E39DA;
mul.f32 f1269, f1276, 0fBF54DB31;
sub.f32 f501, f1268, f1269;
mul.f32 f502, f1276, 0f3F0E39DA;
fma.rn.f32 f503, f467, 0fBF54DB31, f502;
mul.f32 f1266, f471, 0f3EC3EF15;
mul.f32 f1267, f1275, 0fBF6C835E;
sub.f32 f506, f1266, f1267;
mul.f32 f507, f1275, 0f3EC3EF15;
fma.rn.f32 f508, f471, 0fBF6C835E, f507;
mul.f32 f1264, f475, 0f3E47C5C2;
mul.f32 f1265, f1274, 0fBF7B14BE;
sub.f32 f511, f1264, f1265;
mul.f32 f512, f1274, 0f3E47C5C2;
fma.rn.f32 f513, f475, 0fBF7B14BE, f512;
mul.f32 f1262, f453, 0fBE47C5C2;
mul.f32 f1263, f454, 0fBF7B14BE;
sub.f32 f516, f1262, f1263;
mul.f32 f517, f454, 0fBE47C5C2;
fma.rn.f32 f518, f453, 0fBF7B14BE, f517;
mul.f32 f520, f458, 0fBF6C835E;
mul.f32 f1261, f457, 0fBEC3EF15;
sub.f32 f521, f1261, f520;
mul.f32 f522, f458, 0fBEC3EF15;
fma.rn.f32 f523, f457, 0fBF6C835E, f522;
mul.f32 f525, f462, 0fBF54DB31;
mul.f32 f1260, f461, 0fBF0E39DA;
sub.f32 f526, f1260, f525;
mul.f32 f527, f462, 0fBF0E39DA;
fma.rn.f32 f528, f461, 0fBF54DB31, f527;
mul.f32 f529, f465, 0fBF3504F3;
mul.f32 f530, f466, 0fBF3504F3;
sub.f32 f531, f529, f530;
add.f32 f532, f529, f530;
mul.f32 f1258, f469, 0fBF54DB31;
mul.f32 f1259, f470, 0fBF0E39DA;
sub.f32 f535, f1258, f1259;
mul.f32 f536, f470, 0fBF54DB31;
fma.rn.f32 f537, f469, 0fBF0E39DA, f536;
mul.f32 f539, f474, 0fBEC3EF15;
mul.f32 f1257, f473, 0fBF6C835E;
sub.f32 f540, f1257, f539;
mul.f32 f541, f474, 0fBF6C835E;
fma.rn.f32 f542, f473, 0fBEC3EF15, f541;
mul.f32 f544, f478, 0fBE47C5C2;
mul.f32 f1256, f477, 0fBF7B14BE;
sub.f32 f545, f1256, f544;
mul.f32 f546, f478, 0fBF7B14BE;
fma.rn.f32 f547, f477, 0fBE47C5C2, f546;
add.f32 f548, f272, f447;
sub.f32 f550, f272, f447;
add.f32 f1255, f1334, f1281;
sub.f32 f551, f1334, f1281;
add.f32 f552, f276, f481;
sub.f32 f554, f276, f481;
add.f32 f1254, f1333, f483;
sub.f32 f555, f1333, f483;
add.f32 f556, f280, f486;
sub.f32 f558, f280, f486;
add.f32 f1253, f1332, f488;
sub.f32 f559, f1332, f488;
add.f32 f560, f284, f491;
sub.f32 f562, f284, f491;
add.f32 f1252, f1331, f493;
sub.f32 f563, f1331, f493;
add.f32 f564, f288, f496;
sub.f32 f566, f288, f496;
add.f32 f1251, f1330, f498;
sub.f32 f567, f1330, f498;
add.f32 f568, f292, f501;
sub.f32 f570, f292, f501;
add.f32 f1250, f1329, f503;
sub.f32 f571, f1329, f503;
add.f32 f572, f296, f506;
sub.f32 f574, f296, f506;
add.f32 f1249, f1328, f508;
sub.f32 f575, f1328, f508;
add.f32 f576, f300, f511;
sub.f32 f578, f300, f511;
add.f32 f1248, f1327, f513;
sub.f32 f579, f1327, f513;
add.f32 f580, f274, f450;
sub.f32 f582, f274, f450;
sub.f32 f1247, f275, f449;
add.f32 f583, f275, f449;
add.f32 f584, f278, f516;
sub.f32 f586, f278, f516;
add.f32 f1246, f279, f518;
sub.f32 f587, f279, f518;
add.f32 f588, f282, f521;
sub.f32 f590, f282, f521;
add.f32 f1245, f283, f523;
sub.f32 f591, f283, f523;
add.f32 f592, f286, f526;
sub.f32 f594, f286, f526;
add.f32 f1244, f287, f528;
sub.f32 f595, f287, f528;
add.f32 f596, f290, f531;
sub.f32 f598, f290, f531;
add.f32 f1243, f291, f532;
sub.f32 f599, f291, f532;
add.f32 f600, f294, f535;
sub.f32 f602, f294, f535;
add.f32 f1242, f295, f537;
sub.f32 f603, f295, f537;
add.f32 f604, f298, f540;
sub.f32 f606, f298, f540;
add.f32 f1241, f299, f542;
sub.f32 f607, f299, f542;
add.f32 f608, f302, f545;
sub.f32 f610, f302, f545;
add.f32 f1240, f303, f547;
sub.f32 f611, f303, f547;
mov.u32 r15, %tid.x;
shl.b32 r7, r15, 3;
cvt.u64.u32 rd2, r7;
and.b64 rd3, rd2, 24;
mov.u64 rd4, %65;
add.s64 rd5, rd4, rd3;
ld.global.v2.f32 {f612, f613}, [rd5];
mul.f32 f617, f613, f1254;
mul.f32 f1239, f612, f552;
sub.f32 f618, f1239, f617;
mul.f32 f619, f612, f1254;
fma.rn.f32 f620, f613, f552, f619;
mul.f32 f1237, f612, f612;
mul.f32 f1238, f613, f613;
sub.f32 f623, f1237, f1238;
mul.f32 f624, f613, f612;
fma.rn.f32 f625, f613, f612, f624;
mul.f32 f1235, f623, f556;
mul.f32 f1236, f625, f1253;
sub.f32 f628, f1235, f1236;
mul.f32 f629, f623, f1253;
fma.rn.f32 f630, f625, f556, f629;
mul.f32 f1233, f612, f623;
mul.f32 f1234, f613, f625;
sub.f32 f633, f1233, f1234;
mul.f32 f634, f612, f625;
fma.rn.f32 f635, f613, f623, f634;
mul.f32 f1231, f633, f560;
mul.f32 f1232, f635, f1252;
sub.f32 f638, f1231, f1232;
mul.f32 f639, f633, f1252;
fma.rn.f32 f640, f635, f560, f639;
mul.f32 f642, f613, f635;
mul.f32 f1230, f612, f633;
sub.f32 f643, f1230, f642;
mul.f32 f644, f612, f635;
fma.rn.f32 f645, f613, f633, f644;
mul.f32 f647, f645, f1251;
mul.f32 f1229, f643, f564;
sub.f32 f648, f1229, f647;
mul.f32 f649, f643, f1251;
fma.rn.f32 f650, f645, f564, f649;
mul.f32 f652, f613, f645;
mul.f32 f1228, f612, f643;
sub.f32 f653, f1228, f652;
mul.f32 f654, f612, f645;
fma.rn.f32 f655, f613, f643, f654;
mul.f32 f657, f655, f1250;
mul.f32 f1227, f653, f568;
sub.f32 f658, f1227, f657;
mul.f32 f659, f653, f1250;
fma.rn.f32 f660, f655, f568, f659;
mul.f32 f662, f613, f655;
mul.f32 f1226, f612, f653;
sub.f32 f663, f1226, f662;
mul.f32 f664, f612, f655;
fma.rn.f32 f665, f613, f653, f664;
mul.f32 f1224, f663, f572;
mul.f32 f1225, f665, f1249;
sub.f32 f668, f1224, f1225;
mul.f32 f669, f663, f1249;
fma.rn.f32 f670, f665, f572, f669;
mul.f32 f1222, f612, f663;
mul.f32 f1223, f613, f665;
sub.f32 f673, f1222, f1223;
mul.f32 f674, f612, f665;
fma.rn.f32 f675, f613, f663, f674;
mul.f32 f1220, f673, f576;
mul.f32 f1221, f675, f1248;
sub.f32 f678, f1220, f1221;
mul.f32 f679, f673, f1248;
fma.rn.f32 f680, f675, f576, f679;
mul.f32 f1218, f612, f673;
mul.f32 f1219, f613, f675;
sub.f32 f683, f1218, f1219;
mul.f32 f684, f612, f675;
fma.rn.f32 f685, f613, f673, f684;
mul.f32 f687, f685, f1247;
mul.f32 f1217, f683, f580;
sub.f32 f688, f1217, f687;
mul.f32 f689, f683, f1247;
fma.rn.f32 f690, f685, f580, f689;
mul.f32 f692, f613, f685;
mul.f32 f1216, f612, f683;
sub.f32 f693, f1216, f692;
mul.f32 f694, f612, f685;
fma.rn.f32 f695, f613, f683, f694;
mul.f32 f697, f695, f1246;
mul.f32 f1215, f693, f584;
sub.f32 f698, f1215, f697;
mul.f32 f699, f693, f1246;
fma.rn.f32 f700, f695, f584, f699;
mul.f32 f702, f613, f695;
mul.f32 f1214, f612, f693;
sub.f32 f703, f1214, f702;
mul.f32 f704, f612, f695;
fma.rn.f32 f705, f613, f693, f704;
mul.f32 f707, f705, f1245;
mul.f32 f1213, f703, f588;
sub.f32 f708, f1213, f707;
mul.f32 f709, f703, f1245;
fma.rn.f32 f710, f705, f588, f709;
mul.f32 f1211, f612, f703;
mul.f32 f1212, f613, f705;
sub.f32 f713, f1211, f1212;
mul.f32 f714, f612, f705;
fma.rn.f32 f715, f613, f703, f714;
mul.f32 f1209, f713, f592;
mul.f32 f1210, f715, f1244;
sub.f32 f718, f1209, f1210;
mul.f32 f719, f713, f1244;
fma.rn.f32 f720, f715, f592, f719;
mul.f32 f1207, f612, f713;
mul.f32 f1208, f613, f715;
sub.f32 f723, f1207, f1208;
mul.f32 f724, f612, f715;
fma.rn.f32 f725, f613, f713, f724;
mul.f32 f1205, f723, f596;
mul.f32 f1206, f725, f1243;
sub.f32 f728, f1205, f1206;
mul.f32 f729, f723, f1243;
fma.rn.f32 f730, f725, f596, f729;
mul.f32 f732, f613, f725;
mul.f32 f1204, f612, f723;
sub.f32 f733, f1204, f732;
mul.f32 f734, f612, f725;
fma.rn.f32 f735, f613, f723, f734;
mul.f32 f737, f735, f1242;
mul.f32 f1203, f733, f600;
sub.f32 f738, f1203, f737;
mul.f32 f739, f733, f1242;
fma.rn.f32 f740, f735, f600, f739;
mul.f32 f742, f613, f735;
mul.f32 f1202, f612, f733;
sub.f32 f743, f1202, f742;
mul.f32 f744, f612, f735;
fma.rn.f32 f745, f613, f733, f744;
mul.f32 f747, f745, f1241;
mul.f32 f1201, f743, f604;
sub.f32 f748, f1201, f747;
mul.f32 f749, f743, f1241;
fma.rn.f32 f750, f745, f604, f749;
mul.f32 f752, f613, f745;
mul.f32 f1200, f612, f743;
sub.f32 f753, f1200, f752;
mul.f32 f754, f612, f745;
fma.rn.f32 f755, f613, f743, f754;
mul.f32 f757, f755, f1240;
mul.f32 f1199, f753, f608;
sub.f32 f758, f1199, f757;
mul.f32 f759, f753, f1240;
fma.rn.f32 f760, f755, f608, f759;
mul.f32 f1197, f612, f753;
mul.f32 f1198, f613, f755;
sub.f32 f763, f1197, f1198;
mul.f32 f764, f612, f755;
fma.rn.f32 f765, f613, f753, f764;
mul.f32 f1195, f763, f550;
mul.f32 f1196, f765, f551;
sub.f32 f768, f1195, f1196;
mul.f32 f769, f763, f551;
fma.rn.f32 f770, f765, f550, f769;
mul.f32 f1193, f612, f763;
mul.f32 f1194, f613, f765;
sub.f32 f773, f1193, f1194;
mul.f32 f774, f612, f765;
fma.rn.f32 f775, f613, f763, f774;
mul.f32 f777, f775, f555;
mul.f32 f1192, f773, f554;
sub.f32 f778, f1192, f777;
mul.f32 f779, f773, f555;
fma.rn.f32 f780, f775, f554, f779;
mul.f32 f782, f613, f775;
mul.f32 f1191, f612, f773;
sub.f32 f783, f1191, f782;
mul.f32 f784, f612, f775;
fma.rn.f32 f785, f613, f773, f784;
mul.f32 f787, f785, f559;
mul.f32 f1190, f783, f558;
sub.f32 f788, f1190, f787;
mul.f32 f789, f783, f559;
fma.rn.f32 f790, f785, f558, f789;
mul.f32 f792, f613, f785;
mul.f32 f1189, f612, f783;
sub.f32 f793, f1189, f792;
mul.f32 f794, f612, f785;
fma.rn.f32 f795, f613, f783, f794;
mul.f32 f797, f795, f563;
mul.f32 f1188, f793, f562;
sub.f32 f798, f1188, f797;
mul.f32 f799, f793, f563;
fma.rn.f32 f800, f795, f562, f799;
mul.f32 f802, f613, f795;
mul.f32 f1187, f612, f793;
sub.f32 f803, f1187, f802;
mul.f32 f804, f612, f795;
fma.rn.f32 f805, f613, f793, f804;
mul.f32 f1185, f803, f566;
mul.f32 f1186, f805, f567;
sub.f32 f808, f1185, f1186;
mul.f32 f809, f803, f567;
fma.rn.f32 f810, f805, f566, f809;
mul.f32 f1183, f612, f803;
mul.f32 f1184, f613, f805;
sub.f32 f813, f1183, f1184;
mul.f32 f814, f612, f805;
fma.rn.f32 f815, f613, f803, f814;
mul.f32 f1181, f813, f570;
mul.f32 f1182, f815, f571;
sub.f32 f818, f1181, f1182;
mul.f32 f819, f813, f571;
fma.rn.f32 f820, f815, f570, f819;
mul.f32 f1179, f612, f813;
mul.f32 f1180, f613, f815;
sub.f32 f823, f1179, f1180;
mul.f32 f824, f612, f815;
fma.rn.f32 f825, f613, f813, f824;
mul.f32 f827, f825, f575;
mul.f32 f1178, f823, f574;
sub.f32 f828, f1178, f827;
mul.f32 f829, f823, f575;
fma.rn.f32 f830, f825, f574, f829;
mul.f32 f832, f613, f825;
mul.f32 f1177, f612, f823;
sub.f32 f833, f1177, f832;
mul.f32 f834, f612, f825;
fma.rn.f32 f835, f613, f823, f834;
mul.f32 f837, f835, f579;
mul.f32 f1176, f833, f578;
sub.f32 f838, f1176, f837;
mul.f32 f839, f833, f579;
fma.rn.f32 f840, f835, f578, f839;
mul.f32 f842, f613, f835;
mul.f32 f1175, f612, f833;
sub.f32 f843, f1175, f842;
mul.f32 f844, f612, f835;
fma.rn.f32 f845, f613, f833, f844;
mul.f32 f847, f845, f583;
mul.f32 f1174, f843, f582;
sub.f32 f848, f1174, f847;
mul.f32 f849, f843, f583;
fma.rn.f32 f850, f845, f582, f849;
mul.f32 f1172, f612, f843;
mul.f32 f1173, f613, f845;
sub.f32 f853, f1172, f1173;
mul.f32 f854, f612, f845;
fma.rn.f32 f855, f613, f843, f854;
mul.f32 f1170, f853, f586;
mul.f32 f1171, f855, f587;
sub.f32 f858, f1170, f1171;
mul.f32 f859, f853, f587;
fma.rn.f32 f860, f855, f586, f859;
mul.f32 f1168, f612, f853;
mul.f32 f1169, f613, f855;
sub.f32 f863, f1168, f1169;
mul.f32 f864, f612, f855;
fma.rn.f32 f865, f613, f853, f864;
mul.f32 f1166, f863, f590;
mul.f32 f1167, f865, f591;
sub.f32 f868, f1166, f1167;
mul.f32 f869, f863, f591;
fma.rn.f32 f870, f865, f590, f869;
mul.f32 f872, f613, f865;
mul.f32 f1165, f612, f863;
sub.f32 f873, f1165, f872;
mul.f32 f874, f612, f865;
fma.rn.f32 f875, f613, f863, f874;
mul.f32 f877, f875, f595;
mul.f32 f1164, f873, f594;
sub.f32 f878, f1164, f877;
mul.f32 f879, f873, f595;
fma.rn.f32 f880, f875, f594, f879;
mul.f32 f882, f613, f875;
mul.f32 f1163, f612, f873;
sub.f32 f883, f1163, f882;
mul.f32 f884, f612, f875;
fma.rn.f32 f885, f613, f873, f884;
mul.f32 f887, f885, f599;
mul.f32 f1162, f883, f598;
sub.f32 f888, f1162, f887;
mul.f32 f889, f883, f599;
fma.rn.f32 f890, f885, f598, f889;
mul.f32 f892, f613, f885;
mul.f32 f1161, f612, f883;
sub.f32 f893, f1161, f892;
mul.f32 f894, f612, f885;
fma.rn.f32 f895, f613, f883, f894;
mul.f32 f1159, f893, f602;
mul.f32 f1160, f895, f603;
sub.f32 f898, f1159, f1160;
mul.f32 f899, f893, f603;
fma.rn.f32 f900, f895, f602, f899;
mul.f32 f1157, f612, f893;
mul.f32 f1158, f613, f895;
sub.f32 f903, f1157, f1158;
mul.f32 f904, f612, f895;
fma.rn.f32 f905, f613, f893, f904;
mul.f32 f1155, f903, f606;
mul.f32 f1156, f905, f607;
sub.f32 f908, f1155, f1156;
mul.f32 f909, f903, f607;
fma.rn.f32 f910, f905, f606, f909;
mul.f32 f1153, f612, f903;
mul.f32 f1154, f613, f905;
sub.f32 f913, f1153, f1154;
mul.f32 f914, f612, f905;
fma.rn.f32 f915, f613, f903, f914;
mul.f32 f917, f915, f611;
mul.f32 f1152, f913, f610;
sub.f32 f918, f1152, f917;
mov.u32 r17, %tid.x;
mul.f32 f919, f913, f611;
fma.rn.f32 f920, f915, f610, f919;
and.b32 r14, r17, 3;
shl.b32 r8, r17, 7;
and.b32 r9, r8, -512;
add.s32 r10, r4, r9;
barrier.sync 0;
and.b32 r11, r8, 384;
add.s32 r12, r10, r11;
st.shared.v4.f32 [r12], {f548, f618, f628, f638};
st.shared.v4.f32 [r12+16], {f648, f658, f668, f678};
st.shared.v4.f32 [r12+32], {f688, f698, f708, f718};
st.shared.v4.f32 [r12+48], {f728, f738, f748, f758};
st.shared.v4.f32 [r12+64], {f768, f778, f788, f798};
st.shared.v4.f32 [r12+80], {f808, f818, f828, f838};
st.shared.v4.f32 [r12+96], {f848, f858, f868, f878};
st.shared.v4.f32 [r12+112], {f888, f898, f908, f918};
barrier.sync 0;
mad.lo.s32 r13, r14, -124, r12;
ld.shared.f32 f921, [r13];
ld.shared.f32 f922, [r13+16];
ld.shared.f32 f923, [r13+32];
ld.shared.f32 f924, [r13+48];
ld.shared.f32 f925, [r13+64];
ld.shared.f32 f926, [r13+80];
ld.shared.f32 f927, [r13+96];
ld.shared.f32 f928, [r13+112];
ld.shared.f32 f929, [r13+128];
ld.shared.f32 f930, [r13+144];
ld.shared.f32 f931, [r13+160];
ld.shared.f32 f932, [r13+176];
ld.shared.f32 f933, [r13+192];
ld.shared.f32 f934, [r13+208];
ld.shared.f32 f935, [r13+224];
ld.shared.f32 f936, [r13+240];
ld.shared.f32 f937, [r13+256];
ld.shared.f32 f938, [r13+272];
ld.shared.f32 f939, [r13+288];
ld.shared.f32 f940, [r13+304];
ld.shared.f32 f941, [r13+320];
ld.shared.f32 f942, [r13+336];
ld.shared.f32 f943, [r13+352];
ld.shared.f32 f944, [r13+368];
ld.shared.f32 f945, [r13+384];
ld.shared.f32 f946, [r13+400];
ld.shared.f32 f947, [r13+416];
ld.shared.f32 f948, [r13+432];
ld.shared.f32 f949, [r13+448];
ld.shared.f32 f950, [r13+464];
ld.shared.f32 f951, [r13+480];
ld.shared.f32 f952, [r13+496];
barrier.sync 0;
st.shared.v4.f32 [r12], {f1255, f620, f630, f640};
st.shared.v4.f32 [r12+16], {f650, f660, f670, f680};
st.shared.v4.f32 [r12+32], {f690, f700, f710, f720};
st.shared.v4.f32 [r12+48], {f730, f740, f750, f760};
st.shared.v4.f32 [r12+64], {f770, f780, f790, f800};
st.shared.v4.f32 [r12+80], {f810, f820, f830, f840};
st.shared.v4.f32 [r12+96], {f850, f860, f870, f880};
st.shared.v4.f32 [r12+112], {f890, f900, f910, f920};
barrier.sync 0;
ld.shared.f32 f953, [r13];
ld.shared.f32 f954, [r13+16];
ld.shared.f32 f955, [r13+32];
ld.shared.f32 f956, [r13+48];
ld.shared.f32 f957, [r13+64];
ld.shared.f32 f958, [r13+80];
ld.shared.f32 f959, [r13+96];
ld.shared.f32 f960, [r13+112];
ld.shared.f32 f961, [r13+128];
ld.shared.f32 f962, [r13+144];
ld.shared.f32 f963, [r13+160];
ld.shared.f32 f964, [r13+176];
ld.shared.f32 f965, [r13+192];
ld.shared.f32 f966, [r13+208];
ld.shared.f32 f967, [r13+224];
ld.shared.f32 f968, [r13+240];
ld.shared.f32 f969, [r13+256];
ld.shared.f32 f970, [r13+272];
ld.shared.f32 f971, [r13+288];
ld.shared.f32 f972, [r13+304];
ld.shared.f32 f973, [r13+320];
ld.shared.f32 f974, [r13+336];
ld.shared.f32 f975, [r13+352];
ld.shared.f32 f976, [r13+368];
ld.shared.f32 f977, [r13+384];
ld.shared.f32 f978, [r13+400];
ld.shared.f32 f979, [r13+416];
ld.shared.f32 f980, [r13+432];
ld.shared.f32 f981, [r13+448];
ld.shared.f32 f982, [r13+464];
ld.shared.f32 f983, [r13+480];
ld.shared.f32 f984, [r13+496];
add.f32 f985, f921, f937;
sub.f32 f987, f921, f937;
add.f32 f1151, f953, f969;
sub.f32 f988, f953, f969;
add.f32 f989, f929, f945;
sub.f32 f991, f929, f945;
add.f32 f1150, f961, f977;
sub.f32 f992, f961, f977;
add.f32 f993, f922, f938;
sub.f32 f995, f922, f938;
add.f32 f1149, f954, f970;
sub.f32 f996, f954, f970;
add.f32 f997, f930, f946;
sub.f32 f999, f930, f946;
add.f32 f1148, f962, f978;
sub.f32 f1000, f962, f978;
add.f32 f1001, f923, f939;
sub.f32 f1003, f923, f939;
add.f32 f1147, f955, f971;
sub.f32 f1004, f955, f971;
add.f32 f1005, f931, f947;
sub.f32 f1007, f931, f947;
add.f32 f1146, f963, f979;
sub.f32 f1008, f963, f979;
add.f32 f1009, f924, f940;
sub.f32 f1011, f924, f940;
add.f32 f1145, f956, f972;
sub.f32 f1012, f956, f972;
add.f32 f1013, f932, f948;
sub.f32 f1015, f932, f948;
add.f32 f1144, f964, f980;
sub.f32 f1016, f964, f980;
add.f32 f1017, f925, f941;
sub.f32 f1019, f925, f941;
add.f32 f1143, f957, f973;
sub.f32 f1020, f957, f973;
add.f32 f1021, f933, f949;
sub.f32 f1023, f933, f949;
add.f32 f1142, f965, f981;
sub.f32 f1024, f965, f981;
add.f32 f1025, f926, f942;
sub.f32 f1027, f926, f942;
add.f32 f1141, f958, f974;
sub.f32 f1028, f958, f974;
add.f32 f1029, f934, f950;
sub.f32 f1031, f934, f950;
add.f32 f1140, f966, f982;
sub.f32 f1032, f966, f982;
add.f32 f1033, f927, f943;
sub.f32 f1035, f927, f943;
add.f32 f1139, f959, f975;
sub.f32 f1036, f959, f975;
add.f32 f1037, f935, f951;
sub.f32 f1039, f935, f951;
add.f32 f1138, f967, f983;
sub.f32 f1040, f967, f983;
add.f32 f1041, f928, f944;
sub.f32 f1043, f928, f944;
add.f32 f1137, f960, f976;
sub.f32 f1044, f960, f976;
add.f32 f1045, f936, f952;
sub.f32 f1047, f936, f952;
add.f32 f1136, f968, f984;
sub.f32 f1048, f968, f984;
add.f32 %0, f985, f989;
add.f32 %1, f1151, f1150;
add.f32 %3, f1149, f1148;
add.f32 %2, f993, f997;
add.f32 %5, f1147, f1146;
add.f32 %4, f1001, f1005;
add.f32 %7, f1145, f1144;
add.f32 %6, f1009, f1013;
add.f32 %9, f1143, f1142;
add.f32 %8, f1017, f1021;
add.f32 %10, f1025, f1029;
add.f32 %11, f1141, f1140;
add.f32 %12, f1033, f1037;
add.f32 %13, f1139, f1138;
add.f32 %14, f1041, f1045;
add.f32 %15, f1137, f1136;
add.f32 %16, f987, f992;
sub.f32 %17, f988, f991;
add.f32 %18, f995, f1000;
sub.f32 %19, f996, f999;
add.f32 %20, f1003, f1008;
sub.f32 %21, f1004, f1007;
sub.f32 %23, f1012, f1015;
add.f32 %22, f1011, f1016;
sub.f32 %25, f1020, f1023;
add.f32 %24, f1019, f1024;
sub.f32 %27, f1028, f1031;
add.f32 %26, f1027, f1032;
add.f32 %28, f1035, f1040;
sub.f32 %29, f1036, f1039;
add.f32 %30, f1043, f1048;
sub.f32 %31, f1044, f1047;
sub.f32 %32, f985, f989;
sub.f32 %33, f1151, f1150;
sub.f32 %34, f993, f997;
sub.f32 %35, f1149, f1148;
sub.f32 %36, f1001, f1005;
sub.f32 %37, f1147, f1146;
sub.f32 %38, f1009, f1013;
sub.f32 %39, f1145, f1144;
sub.f32 %40, f1017, f1021;
sub.f32 %41, f1143, f1142;
sub.f32 %42, f1025, f1029;
sub.f32 %43, f1141, f1140;
sub.f32 %44, f1033, f1037;
sub.f32 %45, f1139, f1138;
sub.f32 %46, f1041, f1045;
sub.f32 %47, f1137, f1136;
add.f32 %49, f988, f991;
sub.f32 %48, f987, f992;
add.f32 %51, f996, f999;
sub.f32 %50, f995, f1000;
add.f32 %53, f1004, f1007;
sub.f32 %52, f1003, f1008;
add.f32 %55, f1012, f1015;
sub.f32 %54, f1011, f1016;
add.f32 %57, f1020, f1023;
sub.f32 %56, f1019, f1024;
add.f32 %59, f1028, f1031;
sub.f32 %58, f1027, f1032;
add.f32 %61, f1036, f1039;
sub.f32 %60, f1035, f1040;
add.f32 %63, f1044, f1047;
sub.f32 %62, f1043, f1048;
})"
     : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y), "=f"(rmem[8].x), "=f"(rmem[8].y), "=f"(rmem[9].x), "=f"(rmem[9].y), "=f"(rmem[10].x), "=f"(rmem[10].y), "=f"(rmem[11].x), "=f"(rmem[11].y), "=f"(rmem[12].x), "=f"(rmem[12].y), "=f"(rmem[13].x), "=f"(rmem[13].y), "=f"(rmem[14].x), "=f"(rmem[14].y), "=f"(rmem[15].x), "=f"(rmem[15].y), "=f"(rmem[16].x), "=f"(rmem[16].y), "=f"(rmem[17].x), "=f"(rmem[17].y), "=f"(rmem[18].x), "=f"(rmem[18].y), "=f"(rmem[19].x), "=f"(rmem[19].y), "=f"(rmem[20].x), "=f"(rmem[20].y), "=f"(rmem[21].x), "=f"(rmem[21].y), "=f"(rmem[22].x), "=f"(rmem[22].y), "=f"(rmem[23].x), "=f"(rmem[23].y), "=f"(rmem[24].x), "=f"(rmem[24].y), "=f"(rmem[25].x), "=f"(rmem[25].y), "=f"(rmem[26].x), "=f"(rmem[26].y), "=f"(rmem[27].x), "=f"(rmem[27].y), "=f"(rmem[28].x), "=f"(rmem[28].y), "=f"(rmem[29].x), "=f"(rmem[29].y), "=f"(rmem[30].x), "=f"(rmem[30].y), "=f"(rmem[31].x), "=f"(rmem[31].y): "r"(smem), "l"(lut_sp_32_128), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y), "f"(rmem[8].x), "f"(rmem[8].y), "f"(rmem[9].x), "f"(rmem[9].y), "f"(rmem[10].x), "f"(rmem[10].y), "f"(rmem[11].x), "f"(rmem[11].y), "f"(rmem[12].x), "f"(rmem[12].y), "f"(rmem[13].x), "f"(rmem[13].y), "f"(rmem[14].x), "f"(rmem[14].y), "f"(rmem[15].x), "f"(rmem[15].y), "f"(rmem[16].x), "f"(rmem[16].y), "f"(rmem[17].x), "f"(rmem[17].y), "f"(rmem[18].x), "f"(rmem[18].y), "f"(rmem[19].x), "f"(rmem[19].y), "f"(rmem[20].x), "f"(rmem[20].y), "f"(rmem[21].x), "f"(rmem[21].y), "f"(rmem[22].x), "f"(rmem[22].y), "f"(rmem[23].x), "f"(rmem[23].y), "f"(rmem[24].x), "f"(rmem[24].y), 
"f"(rmem[25].x), "f"(rmem[25].y), "f"(rmem[26].x), "f"(rmem[26].y), "f"(rmem[27].x), "f"(rmem[27].y), "f"(rmem[28].x), "f"(rmem[28].y), "f"(rmem[29].x), "f"(rmem[29].y), "f"(rmem[30].x), "f"(rmem[30].y), "f"(rmem[31].x), "f"(rmem[31].y), "f"(rmem[16].y), "f"(rmem[8].y), "f"(rmem[4].y), "f"(rmem[20].y), "f"(rmem[28].y), "f"(rmem[2].y), "f"(rmem[26].y), "f"(rmem[10].y), "f"(rmem[22].y), "f"(rmem[14].y), "f"(rmem[17].y), "f"(rmem[1].y), "f"(rmem[25].y), "f"(rmem[5].y), "f"(rmem[29].y), "f"(rmem[13].y), "f"(rmem[19].y), "f"(rmem[11].y), "f"(rmem[7].y), "f"(rmem[23].y));
};




// cufftdx_private_function<59, float, 1>
//
// Inline-PTX device routine from the 128-point FP32 forward FFT header (see the
// include guard CUFFTDX_FFT_128_FP32_FWD_PTX_HPP). Each calling thread supplies
// two complex values in rmem[0] and rmem[1]; the routine combines them with
// values produced by other threads, exchanged through shared memory, and writes
// the two results back into rmem[0] and rmem[1].
// NOTE(review): the structure (7 sum/difference rounds, lookup tables named
// lut_sp_2_128 ... lut_sp_2_4) looks like a radix-2 128-point transform with two
// points per thread -- inferred from the code shape, confirm against the generator.
//
// Parameters:
//   rmem - per-thread array of complex<float>. Only rmem[0] and rmem[1] are
//          read (as inputs) and written (as outputs).
//   smem - 32-bit base address of the shared-memory exchange buffer; it is only
//          ever used to form st.shared / ld.shared addresses.
//
// Shared-memory layout used by the PTX:
//   base = smem + (tid.y << 10) + ((tid.x << 4) & -1024)
//   i.e. 1024 bytes per tid.y value; all per-round offsets below stay inside
//   that 1024-byte window as long as tid.x < 64.
//   NOTE(review): presumably launched with 64 threads along x per transform;
//   a larger tid.x would spill into the next tid.y window -- TODO confirm.
//
// Round structure (six exchange rounds followed by a final register-only round):
//   1. Compute sum and difference of the two current complex values.
//   2. Multiply the difference, as a complex number, by a float2 loaded with
//      ld.global from the round's lookup table.
//   3. barrier.sync 0, store sum and product as float2 to shared memory
//      (product at +8, +16, +32, +64, +128, +256 bytes from the sum in rounds
//      1..6), barrier.sync 0, then load two float2 values 512 bytes apart.
//   Lookup table and entry index per round:
//      round 1: lut_sp_2_128[ tid.x       & 63]
//      round 2: lut_sp_2_64 [(tid.x >> 1) & 31]
//      round 3: lut_sp_2_32 [(tid.x >> 2) & 15]
//      round 4: lut_sp_2_16 [(tid.x >> 3) &  7]
//      round 5: lut_sp_2_8  [(tid.x >> 4) &  3]
//      round 6: lut_sp_2_4  [(tid.x >> 5) &  1]
//   After round 6 the final sum goes to rmem[0] and the final difference to
//   rmem[1]; no table multiply is applied in that last step.
//
// Synchronization: the PTX executes barrier.sync 0 twelve times unconditionally,
// so every thread that participates in barrier 0 must call this function.
//
// NOTE(review): the asm statement reads and writes shared memory through `smem`
// but declares no "memory" clobber. This matches the other specializations in
// this file; confirm that callers do not depend on the compiler ordering their
// own shared-memory accesses relative to this call.
template<> __forceinline__ __device__ void cufftdx_private_function<59, float, 1>(cufftdx::detail::complex<float> *rmem, unsigned smem){

// The whole routine is one PTX block. Operand numbering (%N) is positional and
// is described next to the constraint list at the end of the statement.
asm volatile (R"({
.reg .f32 f<139>;
.reg .b32 r<49>;
.reg .b64 rd<21>;
mov.u32 r1, %tid.y;
shl.b32 r2, r1, 10;
mov.u32 r3, %4;
add.s32 r4, r3, r2;
mov.u32 r5, %tid.x;
sub.f32 f9, %11, %13;
sub.f32 f10, %12, %14;
shl.b32 r6, r5, 4;
and.b32 r7, r6, -1024;
add.s32 r8, r4, r7;
shl.b32 r9, r5, 3;
cvt.u64.u32 rd2, r9;
and.b64 rd3, rd2, 504;
mov.u64 rd4, %5;
add.s64 rd5, rd4, rd3;
ld.global.v2.f32 {f11, f12}, [rd5];
mul.f32 f15, f11, f9;
mul.f32 f16, f12, f10;
mul.f32 f17, f11, f10;
barrier.sync 0;
and.b32 r10, r6, 1008;
add.s32 r11, r8, r10;
add.f32 f18, %12, %14;
add.f32 f19, %11, %13;
st.shared.v2.f32 [r11], {f19, f18};
sub.f32 f20, f15, f16;
fma.rn.f32 f21, f12, f9, f17;
st.shared.v2.f32 [r11+8], {f20, f21};
barrier.sync 0;
and.b32 r12, r9, 504;
sub.s32 r13, r11, r12;
ld.shared.v2.f32 {f22, f23}, [r13];
ld.shared.v2.f32 {f26, f27}, [r13+512];
sub.f32 f30, f22, f26;
sub.f32 f31, f23, f27;
bfe.u32 r14, r5, 1, 5;
mul.wide.u32 rd6, r14, 8;
mov.u64 rd7, %6;
add.s64 rd8, rd7, rd6;
ld.global.v2.f32 {f32, f33}, [rd8];
mul.f32 f36, f32, f30;
mul.f32 f37, f33, f31;
mul.f32 f38, f32, f31;
and.b32 r15, r9, 8;
add.s32 r16, r8, r15;
barrier.sync 0;
and.b32 r17, r6, 992;
add.s32 r18, r16, r17;
add.f32 f39, f23, f27;
add.f32 f40, f22, f26;
st.shared.v2.f32 [r18], {f40, f39};
fma.rn.f32 f41, f33, f30, f38;
sub.f32 f42, f36, f37;
st.shared.v2.f32 [r18+16], {f42, f41};
barrier.sync 0;
and.b32 r19, r9, 496;
sub.s32 r20, r18, r19;
ld.shared.v2.f32 {f43, f44}, [r20];
ld.shared.v2.f32 {f47, f48}, [r20+512];
sub.f32 f51, f43, f47;
sub.f32 f52, f44, f48;
bfe.u32 r21, r5, 2, 4;
mul.wide.u32 rd9, r21, 8;
mov.u64 rd10, %7;
add.s64 rd11, rd10, rd9;
ld.global.v2.f32 {f53, f54}, [rd11];
mul.f32 f57, f53, f51;
mul.f32 f58, f54, f52;
mul.f32 f59, f53, f52;
and.b32 r22, r9, 24;
add.s32 r23, r8, r22;
barrier.sync 0;
and.b32 r24, r6, 960;
add.s32 r25, r23, r24;
add.f32 f60, f44, f48;
add.f32 f61, f43, f47;
st.shared.v2.f32 [r25], {f61, f60};
fma.rn.f32 f62, f54, f51, f59;
sub.f32 f63, f57, f58;
st.shared.v2.f32 [r25+32], {f63, f62};
barrier.sync 0;
and.b32 r26, r9, 480;
sub.s32 r27, r25, r26;
ld.shared.v2.f32 {f64, f65}, [r27];
ld.shared.v2.f32 {f68, f69}, [r27+512];
sub.f32 f72, f64, f68;
sub.f32 f73, f65, f69;
and.b32 r28, r5, 56;
cvt.u64.u32 rd12, r28;
mov.u64 rd13, %8;
add.s64 rd14, rd13, rd12;
ld.global.v2.f32 {f74, f75}, [rd14];
mul.f32 f78, f74, f72;
mul.f32 f79, f75, f73;
mul.f32 f80, f74, f73;
and.b32 r29, r9, 56;
add.s32 r30, r8, r29;
barrier.sync 0;
and.b32 r31, r6, 896;
add.s32 r32, r30, r31;
add.f32 f81, f65, f69;
add.f32 f82, f64, f68;
st.shared.v2.f32 [r32], {f82, f81};
fma.rn.f32 f83, f75, f72, f80;
sub.f32 f84, f78, f79;
st.shared.v2.f32 [r32+64], {f84, f83};
barrier.sync 0;
and.b32 r33, r9, 448;
sub.s32 r34, r32, r33;
ld.shared.v2.f32 {f85, f86}, [r34];
ld.shared.v2.f32 {f89, f90}, [r34+512];
sub.f32 f93, f85, f89;
sub.f32 f94, f86, f90;
bfe.u32 r35, r5, 4, 2;
mul.wide.u32 rd15, r35, 8;
mov.u64 rd16, %9;
add.s64 rd17, rd16, rd15;
ld.global.v2.f32 {f95, f96}, [rd17];
mul.f32 f99, f95, f93;
mul.f32 f100, f96, f94;
mul.f32 f101, f95, f94;
and.b32 r36, r9, 120;
add.s32 r37, r8, r36;
barrier.sync 0;
and.b32 r38, r6, 768;
add.s32 r39, r37, r38;
add.f32 f102, f86, f90;
add.f32 f103, f85, f89;
st.shared.v2.f32 [r39], {f103, f102};
fma.rn.f32 f104, f96, f93, f101;
sub.f32 f105, f99, f100;
st.shared.v2.f32 [r39+128], {f105, f104};
barrier.sync 0;
and.b32 r40, r9, 384;
sub.s32 r41, r39, r40;
ld.shared.v2.f32 {f106, f107}, [r41];
ld.shared.v2.f32 {f110, f111}, [r41+512];
sub.f32 f114, f106, f110;
sub.f32 f115, f107, f111;
bfe.u32 r42, r5, 5, 1;
mul.wide.u32 rd18, r42, 8;
mov.u64 rd19, %10;
add.s64 rd20, rd19, rd18;
ld.global.v2.f32 {f116, f117}, [rd20];
mul.f32 f120, f116, f114;
mul.f32 f121, f117, f115;
mul.f32 f122, f116, f115;
and.b32 r43, r9, 248;
add.s32 r44, r8, r43;
barrier.sync 0;
and.b32 r45, r6, 512;
add.s32 r46, r44, r45;
add.f32 f123, f107, f111;
add.f32 f124, f106, f110;
st.shared.v2.f32 [r46], {f124, f123};
fma.rn.f32 f125, f117, f114, f122;
sub.f32 f126, f120, f121;
st.shared.v2.f32 [r46+256], {f126, f125};
barrier.sync 0;
and.b32 r47, r9, 256;
sub.s32 r48, r46, r47;
ld.shared.v2.f32 {f127, f128}, [r48];
ld.shared.v2.f32 {f131, f132}, [r48+512];
add.f32 %1, f128, f132;
add.f32 %0, f127, f131;
sub.f32 %3, f128, f132;
sub.f32 %2, f127, f131;
})"
     // Operand map for the PTX above:
     //   outputs  %0..%3   = rmem[0].x, rmem[0].y, rmem[1].x, rmem[1].y
     //   input    %4       = smem (32-bit shared-memory base address)
     //   inputs   %5..%10  = lut_sp_2_128, lut_sp_2_64, lut_sp_2_32,
     //                       lut_sp_2_16, lut_sp_2_8, lut_sp_2_4 (64-bit addresses,
     //                       dereferenced with ld.global)
     //   inputs   %11..%14 = rmem[0].x, rmem[0].y, rmem[1].x, rmem[1].y
     // The outputs are written only by the last four PTX instructions, after every
     // read of %11..%14.
     : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y): "r"(smem), "l"(lut_sp_2_128), "l"(lut_sp_2_64), "l"(lut_sp_2_32), "l"(lut_sp_2_16), "l"(lut_sp_2_8), "l"(lut_sp_2_4), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y));
};




// cufftdx_private_function<60, float, 1>
//
// Inline-PTX device routine from the 128-point FP32 forward FFT header (see the
// include guard CUFFTDX_FFT_128_FP32_FWD_PTX_HPP). Each calling thread supplies
// two complex values in rmem[0] and rmem[1]; the routine combines them with
// values produced by other threads, exchanged through shared memory, and writes
// the two results back into rmem[0] and rmem[1].
// Unlike a float2-based exchange, this variant moves real parts and imaginary
// parts through shared memory in two separate passes per round, so each
// exchanged element occupies 4 bytes and each tid.y value uses 512 bytes.
// NOTE(review): the structure (7 sum/difference rounds, lookup tables named
// lut_sp_2_128 ... lut_sp_2_4) looks like a radix-2 128-point transform with two
// points per thread -- inferred from the code shape, confirm against the generator.
//
// Parameters:
//   rmem - per-thread array of complex<float>. Only rmem[0] and rmem[1] are
//          read (as inputs) and written (as outputs).
//   smem - 32-bit base address of the shared-memory exchange buffer; it is only
//          ever used to form st.shared / ld.shared addresses.
//
// Shared-memory layout used by the PTX:
//   base = smem + (tid.y << 9) + ((tid.x << 3) & -512)
//   i.e. 512 bytes per tid.y value; all per-round offsets below stay inside
//   that 512-byte window as long as tid.x < 64.
//   NOTE(review): presumably launched with 64 threads along x per transform;
//   a larger tid.x would spill into the next tid.y window -- TODO confirm.
//
// Round structure (six exchange rounds followed by a final register-only round):
//   1. Compute sum and difference of the two current complex values.
//   2. Multiply the difference, as a complex number, by a float2 loaded with
//      ld.global from the round's lookup table.
//   3. barrier.sync 0, store the REAL parts of sum and product, barrier.sync 0,
//      load two floats 256 bytes apart; then barrier.sync 0, store the
//      IMAGINARY parts to the same addresses, barrier.sync 0, load again from
//      the same two addresses.
//      The product is stored +4, +8, +16, +32, +64, +128 bytes from the sum in
//      rounds 1..6 (round 1 uses a single v2 store for the adjacent pair).
//   Lookup table and entry index per round:
//      round 1: lut_sp_2_128[ tid.x       & 63]
//      round 2: lut_sp_2_64 [(tid.x >> 1) & 31]
//      round 3: lut_sp_2_32 [(tid.x >> 2) & 15]
//      round 4: lut_sp_2_16 [(tid.x >> 3) &  7]
//      round 5: lut_sp_2_8  [(tid.x >> 4) &  3]
//      round 6: lut_sp_2_4  [(tid.x >> 5) &  1]
//   After round 6 the final sum goes to rmem[0] and the final difference to
//   rmem[1]; no table multiply is applied in that last step.
//
// Synchronization: the PTX executes barrier.sync 0 twenty-four times
// unconditionally (four per exchange round), so every thread that participates
// in barrier 0 must call this function.
//
// NOTE(review): the asm statement reads and writes shared memory through `smem`
// but declares no "memory" clobber. This matches the other specializations in
// this file; confirm that callers do not depend on the compiler ordering their
// own shared-memory accesses relative to this call.
template<> __forceinline__ __device__ void cufftdx_private_function<60, float, 1>(cufftdx::detail::complex<float> *rmem, unsigned smem){

// The whole routine is one PTX block. Operand numbering (%N) is positional and
// is described next to the constraint list at the end of the statement.
asm volatile (R"({
.reg .f32 f<115>;
.reg .b32 r<49>;
.reg .b64 rd<21>;
mov.u32 r1, %tid.y;
shl.b32 r2, r1, 9;
mov.u32 r3, %4;
add.s32 r4, r3, r2;
mov.u32 r5, %tid.x;
add.f32 f9, %11, %13;
add.f32 f10, %12, %14;
sub.f32 f11, %11, %13;
sub.f32 f12, %12, %14;
shl.b32 r6, r5, 3;
cvt.u64.u32 rd2, r6;
and.b64 rd3, rd2, 504;
mov.u64 rd4, %5;
add.s64 rd5, rd4, rd3;
ld.global.v2.f32 {f13, f14}, [rd5];
mul.f32 f17, f13, f11;
mul.f32 f18, f14, f12;
sub.f32 f19, f17, f18;
mul.f32 f20, f13, f12;
fma.rn.f32 f21, f14, f11, f20;
and.b32 r7, r6, -512;
add.s32 r8, r4, r7;
barrier.sync 0;
and.b32 r9, r6, 504;
add.s32 r10, r8, r9;
st.shared.v2.f32 [r10], {f9, f19};
barrier.sync 0;
shl.b32 r11, r5, 2;
and.b32 r12, r11, 252;
sub.s32 r13, r10, r12;
ld.shared.f32 f22, [r13];
ld.shared.f32 f23, [r13+256];
barrier.sync 0;
st.shared.v2.f32 [r10], {f10, f21};
barrier.sync 0;
ld.shared.f32 f24, [r13];
ld.shared.f32 f25, [r13+256];
add.f32 f26, f22, f23;
add.f32 f27, f24, f25;
sub.f32 f28, f22, f23;
sub.f32 f29, f24, f25;
bfe.u32 r14, r5, 1, 5;
mul.wide.u32 rd6, r14, 8;
mov.u64 rd7, %6;
add.s64 rd8, rd7, rd6;
ld.global.v2.f32 {f30, f31}, [rd8];
mul.f32 f34, f30, f28;
mul.f32 f35, f31, f29;
sub.f32 f36, f34, f35;
mul.f32 f37, f30, f29;
fma.rn.f32 f38, f31, f28, f37;
and.b32 r15, r11, 4;
add.s32 r16, r8, r15;
barrier.sync 0;
and.b32 r17, r6, 496;
add.s32 r18, r16, r17;
st.shared.f32 [r18], f26;
st.shared.f32 [r18+8], f36;
barrier.sync 0;
and.b32 r19, r11, 248;
sub.s32 r20, r18, r19;
ld.shared.f32 f39, [r20];
ld.shared.f32 f40, [r20+256];
barrier.sync 0;
st.shared.f32 [r18], f27;
st.shared.f32 [r18+8], f38;
barrier.sync 0;
ld.shared.f32 f41, [r20];
ld.shared.f32 f42, [r20+256];
add.f32 f43, f39, f40;
add.f32 f44, f41, f42;
sub.f32 f45, f39, f40;
sub.f32 f46, f41, f42;
bfe.u32 r21, r5, 2, 4;
mul.wide.u32 rd9, r21, 8;
mov.u64 rd10, %7;
add.s64 rd11, rd10, rd9;
ld.global.v2.f32 {f47, f48}, [rd11];
mul.f32 f51, f47, f45;
mul.f32 f52, f48, f46;
sub.f32 f53, f51, f52;
mul.f32 f54, f47, f46;
fma.rn.f32 f55, f48, f45, f54;
and.b32 r22, r11, 12;
add.s32 r23, r8, r22;
barrier.sync 0;
and.b32 r24, r6, 480;
add.s32 r25, r23, r24;
st.shared.f32 [r25], f43;
st.shared.f32 [r25+16], f53;
barrier.sync 0;
and.b32 r26, r11, 240;
sub.s32 r27, r25, r26;
ld.shared.f32 f56, [r27];
ld.shared.f32 f57, [r27+256];
barrier.sync 0;
st.shared.f32 [r25], f44;
st.shared.f32 [r25+16], f55;
barrier.sync 0;
ld.shared.f32 f58, [r27];
ld.shared.f32 f59, [r27+256];
add.f32 f60, f56, f57;
add.f32 f61, f58, f59;
sub.f32 f62, f56, f57;
sub.f32 f63, f58, f59;
and.b32 r28, r5, 56;
cvt.u64.u32 rd12, r28;
mov.u64 rd13, %8;
add.s64 rd14, rd13, rd12;
ld.global.v2.f32 {f64, f65}, [rd14];
mul.f32 f68, f64, f62;
mul.f32 f69, f65, f63;
sub.f32 f70, f68, f69;
mul.f32 f71, f64, f63;
fma.rn.f32 f72, f65, f62, f71;
and.b32 r29, r11, 28;
add.s32 r30, r8, r29;
barrier.sync 0;
and.b32 r31, r6, 448;
add.s32 r32, r30, r31;
st.shared.f32 [r32], f60;
st.shared.f32 [r32+32], f70;
barrier.sync 0;
and.b32 r33, r11, 224;
sub.s32 r34, r32, r33;
ld.shared.f32 f73, [r34];
ld.shared.f32 f74, [r34+256];
barrier.sync 0;
st.shared.f32 [r32], f61;
st.shared.f32 [r32+32], f72;
barrier.sync 0;
ld.shared.f32 f75, [r34];
ld.shared.f32 f76, [r34+256];
add.f32 f77, f73, f74;
add.f32 f78, f75, f76;
sub.f32 f79, f73, f74;
sub.f32 f80, f75, f76;
bfe.u32 r35, r5, 4, 2;
mul.wide.u32 rd15, r35, 8;
mov.u64 rd16, %9;
add.s64 rd17, rd16, rd15;
ld.global.v2.f32 {f81, f82}, [rd17];
mul.f32 f85, f81, f79;
mul.f32 f86, f82, f80;
sub.f32 f87, f85, f86;
mul.f32 f88, f81, f80;
fma.rn.f32 f89, f82, f79, f88;
and.b32 r36, r11, 60;
add.s32 r37, r8, r36;
barrier.sync 0;
and.b32 r38, r6, 384;
add.s32 r39, r37, r38;
st.shared.f32 [r39], f77;
st.shared.f32 [r39+64], f87;
barrier.sync 0;
and.b32 r40, r11, 192;
sub.s32 r41, r39, r40;
ld.shared.f32 f90, [r41];
ld.shared.f32 f91, [r41+256];
barrier.sync 0;
st.shared.f32 [r39], f78;
st.shared.f32 [r39+64], f89;
barrier.sync 0;
ld.shared.f32 f92, [r41];
ld.shared.f32 f93, [r41+256];
add.f32 f94, f90, f91;
add.f32 f95, f92, f93;
sub.f32 f96, f90, f91;
sub.f32 f97, f92, f93;
bfe.u32 r42, r5, 5, 1;
mul.wide.u32 rd18, r42, 8;
mov.u64 rd19, %10;
add.s64 rd20, rd19, rd18;
ld.global.v2.f32 {f98, f99}, [rd20];
mul.f32 f102, f98, f96;
mul.f32 f103, f99, f97;
sub.f32 f104, f102, f103;
mul.f32 f105, f98, f97;
fma.rn.f32 f106, f99, f96, f105;
and.b32 r43, r11, 124;
add.s32 r44, r8, r43;
barrier.sync 0;
and.b32 r45, r6, 256;
add.s32 r46, r44, r45;
st.shared.f32 [r46], f94;
st.shared.f32 [r46+128], f104;
barrier.sync 0;
and.b32 r47, r11, 128;
sub.s32 r48, r46, r47;
ld.shared.f32 f107, [r48];
ld.shared.f32 f108, [r48+256];
barrier.sync 0;
st.shared.f32 [r46], f95;
st.shared.f32 [r46+128], f106;
barrier.sync 0;
ld.shared.f32 f109, [r48];
ld.shared.f32 f110, [r48+256];
add.f32 %0, f107, f108;
add.f32 %1, f109, f110;
sub.f32 %2, f107, f108;
sub.f32 %3, f109, f110;
})"
     // Operand map for the PTX above:
     //   outputs  %0..%3   = rmem[0].x, rmem[0].y, rmem[1].x, rmem[1].y
     //   input    %4       = smem (32-bit shared-memory base address)
     //   inputs   %5..%10  = lut_sp_2_128, lut_sp_2_64, lut_sp_2_32,
     //                       lut_sp_2_16, lut_sp_2_8, lut_sp_2_4 (64-bit addresses,
     //                       dereferenced with ld.global)
     //   inputs   %11..%14 = rmem[0].x, rmem[0].y, rmem[1].x, rmem[1].y
     // The outputs are written only by the last four PTX instructions, after every
     // read of %11..%14.
     : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y): "r"(smem), "l"(lut_sp_2_128), "l"(lut_sp_2_64), "l"(lut_sp_2_32), "l"(lut_sp_2_16), "l"(lut_sp_2_8), "l"(lut_sp_2_4), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y));
};


#endif
