diff --git a/docs/examples/te_gemma/media/calibration.svg b/docs/examples/te_gemma/media/calibration.svg new file mode 100644 index 0000000000..b1e1b5ae4b --- /dev/null +++ b/docs/examples/te_gemma/media/calibration.svg @@ -0,0 +1 @@ +FP8 with initial scaling factorsHighprecisionweightInitialFP8 scalingfactorsFP8WeightFP8InputHighprecisioninputFP8GEMMWeight calibrationHighprecisionweightFP8 scalingfactorsHighprecisioninputHighprecisionGEMMFP8 with calibrated scaling factorsHighprecisionweightCalibratedFP8 scalingfactorsFP8WeightFP8InputHighprecisioninputFP8GEMM \ No newline at end of file diff --git a/docs/examples/te_gemma/media/calibration_1_half.svg b/docs/examples/te_gemma/media/calibration_1_half.svg new file mode 100644 index 0000000000..af2641387f --- /dev/null +++ b/docs/examples/te_gemma/media/calibration_1_half.svg @@ -0,0 +1 @@ +HighprecisionweightInitialFP8 scalingfactorsFP8WeightFP8InputHighprecisioninputFP8GEMMHighprecisionweightFP8 scalingfactorsHighprecisioninputHighprecisionGEMMFP8 with initial scaling factorsWeight calibration \ No newline at end of file diff --git a/docs/examples/te_gemma/media/calibration_2_half.svg b/docs/examples/te_gemma/media/calibration_2_half.svg new file mode 100644 index 0000000000..2d56f7d434 --- /dev/null +++ b/docs/examples/te_gemma/media/calibration_2_half.svg @@ -0,0 +1 @@ +Weight calibrationHighprecisionweightFP8 scalingfactorsHighprecisioninputHighprecisionGEMMFP8 with calibrated scaling factorsHighprecisionweightCalibratedFP8 scalingfactorsFP8WeightFP8InputHighprecisioninputFP8GEMM \ No newline at end of file diff --git a/docs/examples/te_gemma/media/fp8_model_init.svg b/docs/examples/te_gemma/media/fp8_model_init.svg new file mode 100644 index 0000000000..c7fce2120d --- /dev/null +++ b/docs/examples/te_gemma/media/fp8_model_init.svg @@ -0,0 +1 @@ +FP32/BF16FP8FP8 with fp8_model_init()FP8weightFP8GEMMHighprecisionweightHighprecisioninputHighprecisionGEMMHighprecisionweightFP8WeightFP8inputFP8GEMMFP8input \ No newline at end of file diff --git a/docs/examples/te_gemma/media/fp8_model_init_1_half.svg b/docs/examples/te_gemma/media/fp8_model_init_1_half.svg new file mode 100644 index 0000000000..3b217a3eb2 --- /dev/null +++ b/docs/examples/te_gemma/media/fp8_model_init_1_half.svg @@ -0,0 +1 @@ +FP32/BF16HighprecisionweightHighprecisioninputHighprecisionGEMMHighprecisionweightFP8WeightFP8inputFP8GEMMFP8 \ No newline at end of file diff --git a/docs/examples/te_gemma/media/fp8_model_init_2_half.svg b/docs/examples/te_gemma/media/fp8_model_init_2_half.svg new file mode 100644 index 0000000000..46587664fe --- /dev/null +++ b/docs/examples/te_gemma/media/fp8_model_init_2_half.svg @@ -0,0 +1 @@ +FP8FP8 with fp8_model_init()FP8weightFP8GEMMHighprecisionweightFP8WeightFP8inputFP8GEMMFP8input \ No newline at end of file diff --git a/docs/examples/te_gemma/media/generation_animation.gif b/docs/examples/te_gemma/media/generation_animation.gif new file mode 100644 index 0000000000..25150cb9b6 Binary files /dev/null and b/docs/examples/te_gemma/media/generation_animation.gif differ diff --git a/docs/examples/te_gemma/media/graphs.svg b/docs/examples/te_gemma/media/graphs.svg new file mode 100644 index 0000000000..f734637e6d --- /dev/null +++ b/docs/examples/te_gemma/media/graphs.svg @@ -0,0 +1 @@ +Without CUDA GraphsWith CUDA GraphsLaunch 1Kernel 1Launch 2Kernel 2Launch 3Kernel 3Launch Graph 1Kernel 1Kernel 2Kernel 3 \ No newline at end of file diff --git a/docs/examples/te_gemma/media/graphs_1.png b/docs/examples/te_gemma/media/graphs_1.png new file 
mode 100644 index 0000000000..f42b50fe0d Binary files /dev/null and b/docs/examples/te_gemma/media/graphs_1.png differ diff --git a/docs/examples/te_gemma/media/graphs_2.png b/docs/examples/te_gemma/media/graphs_2.png new file mode 100644 index 0000000000..35c34ede55 Binary files /dev/null and b/docs/examples/te_gemma/media/graphs_2.png differ diff --git a/docs/examples/te_gemma/media/plot.svg b/docs/examples/te_gemma/media/plot.svg new file mode 100644 index 0000000000..481f156df6 --- /dev/null +++ b/docs/examples/te_gemma/media/plot.svg @@ -0,0 +1 @@ +87.68 s54.11 s28.22 s16.75 s12.13 s0 s10 s20 s30 s40 s50 s60 s70 s80 s90 s100 sHF (baseline)TE (subsitution ofGemmaDecoderLayer withte.TransformerLayer)TE + THD attentionTE + THD attention + CUDA GraphsTE + THD attention + FP8 \ No newline at end of file diff --git a/docs/examples/te_gemma/media/thd_bshd.svg b/docs/examples/te_gemma/media/thd_bshd.svg new file mode 100644 index 0000000000..47eed69565 --- /dev/null +++ b/docs/examples/te_gemma/media/thd_bshd.svg @@ -0,0 +1 @@ +BSHD LayoutQKVQKVCumulative sequence lengths:3, 3 + 1, 3 + 1 + 3, 3 + 1 + 3 + 1Sequence offsets:0, 4, 8, 12[batch_size,seq_len,head_nr,dim][total_nr_tokens,head_nr,dim]Seq. 1Seq. 2Seq. 4Seq. 3sbtTHD LayoutPad. 1Pad. 2Pad. 4Pad. 3Attention masktokenpadding \ No newline at end of file diff --git a/docs/examples/te_gemma/requirements.txt b/docs/examples/te_gemma/requirements.txt new file mode 100644 index 0000000000..c90fb6dad0 --- /dev/null +++ b/docs/examples/te_gemma/requirements.txt @@ -0,0 +1,4 @@ +transformers==4.41.1 +accelerate==0.30.1 +datasets==2.19.1 +sentencepiece==0.2.0 \ No newline at end of file diff --git a/docs/examples/te_gemma/te_gemma.py b/docs/examples/te_gemma/te_gemma.py new file mode 100644 index 0000000000..758f77219f --- /dev/null +++ b/docs/examples/te_gemma/te_gemma.py @@ -0,0 +1,476 @@ +# Copyright (c) 2022-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# See LICENSE for license information. + +from contextlib import contextmanager + +from typing import Optional + +import torch +import transformer_engine as te +from transformer_engine.pytorch.attention import InferenceParams, RotaryPositionEmbedding +from transformer_engine.common.recipe import Format, DelayedScaling +from torch.cuda.amp import autocast + +import transformers +from transformers.models.gemma.modeling_gemma import GemmaForCausalLM, GemmaConfig, GemmaModel + +import torch.nn.functional as F + + +class TEGemmaDecoderLayer(te.pytorch.TransformerLayer): + """ + Wrapper class over TE's `TransformerLayer`. This makes the wrapper very + similar to HF's `GemmaDecoderLayer` and easier to replace it in the code. + + Args: + config: GemmaConfig + args: positional args (for compatibility with `GemmaDecoderLayer`) + kwargs: keyword args (for compatibility with `GemmaDecoderLayer`) + """ + + def __init__(self, config: GemmaConfig, layer_idx: int, *args, **kwargs): + super().__init__( + hidden_size=config.hidden_size, + ffn_hidden_size=config.intermediate_size, + num_attention_heads=config.num_attention_heads, + bias=False, + layernorm_epsilon=config.rms_norm_eps, + hidden_dropout=0, + attention_dropout=0, + fuse_qkv_params=config.fuse_qkv_params, + normalization="RMSNorm", + activation="geglu", + attn_input_format=config.qkv_format, + num_gqa_groups=config.num_key_value_heads, + kv_channels=256, + layer_number=( + layer_idx + 1 + ), # Layer numbers in TE starts from 1, not 0 like in the HF. 
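+ # Gemma's RMSNorm learns (1 + gamma) rather than gamma, which is what zero_centered_gamma=True below enables in TE.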
+ zero_centered_gamma=True, + ) + self.te_rope_emb = RotaryPositionEmbedding(256)( + max_seq_len=config.max_position_embeddings + ).cuda() + + def forward(self, *args, **kwargs): # We need to additionally pass positional encoding. + # this args cannot be passed to TransformerLayer + keys_to_remove = [ + "position_ids", + "past_key_value", + "output_attentions", + "use_cache", + "cache_position", + ] + for key in keys_to_remove: + kwargs.pop(key, None) + # We need to return tuple to be compatible with HF. + return (super().forward(*args, rotary_pos_emb=self.te_rope_emb, **kwargs),) + + +class StaticGemmaModel(torch.nn.Module): + """ + StaticGemma is based of HF GemmaModel class. + It is adjusted to work properly with CUDA Graphs. + """ + + def __init__( + self, + model: GemmaModel, + dtype: torch.dtype, + mask: torch.Tensor, + lm_head: torch.nn.Module, + ): + super().__init__() + self.model = model + self.normalizer = torch.tensor(self.model.config.hidden_size**0.5, dtype=dtype) + self.mask = mask + self.lm_head = lm_head + + def set_inference_params(self, inference_params): + self.inference_params = inference_params + + def forward(self, hidden_states: torch.Tensor, attention_mask: torch.Tensor = None): + with torch.no_grad(): + # static operation - for CUDA graphs + hidden_states.data[:] = hidden_states.data[:] * self.normalizer + for decoder_layer in self.model.layers: + hidden_states.data[:] = decoder_layer( + hidden_states, + attention_mask=attention_mask, + self_attn_mask_type=self.mask, + inference_params=self.inference_params, + )[ + 0 + ] # static copy - for CUDA graphs + + hidden_states.copy_(self.model.norm(hidden_states)) # static copy - for CUDA graphs + logits = self.lm_head(hidden_states) + logits = logits.float() + return logits + + +class GemmaGenerator(torch.nn.Module): + """ + GemmaGenerator gets one layer of embeddins, + makes forward pass and returns next tokens. + """ + + def __init__( + self, model: GemmaModel, lm_head: torch.nn.Module, dtype: torch.dtype, qkv_format: str + ): + super().__init__() + self.model = model + self.gemma_layers = StaticGemmaModel(model, dtype, "padding", lm_head) + self.qkv_format = qkv_format + + def set_inference_params(self, inference_params): + self.inference_params = inference_params + self.gemma_layers.set_inference_params(inference_params) + + def forward(self, hidden_states: torch.Tensor, mask: torch.Tensor = None): + logits = self.gemma_layers(hidden_states, attention_mask=mask) + + assert logits.shape[0] == hidden_states.shape[0] # b + assert logits.shape[1] == hidden_states.shape[1] # seq_len + # logits.shape[2] = number of tokens + logits = logits[:, -1, :] + next_tokens = torch.argmax(logits, dim=1) + + # static copy for CUDA graphs + hidden_states.copy_(self.model.embed_tokens(next_tokens).unsqueeze(1)) + + # self.inference_params contains for example kv_cache. + # This needs to be called before every pass, + # to update the information of sequence lengths. + # Here we increase sequence offsets by one, + # because we generated one token for every sequence. + if self.qkv_format == "thd": + self.inference_params.setup_before_new_input( + lengths_tensor=torch.ones((next_tokens.shape[0],), device="cuda"), + max_input_length=1, + ) + else: + self.inference_params.setup_before_new_input(length=1) + + return next_tokens + + +@contextmanager +def replace_decoder(te_decoder_cls): + """ + Replace `GemmaDecoderLayer` with custom `TEGemmaDecoderLayer`. 
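+ The original class is restored when the context manager exits, so the patch only affects models constructed inside it.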
+ """ + original_gemma_decoder_cls = transformers.models.gemma.modeling_gemma.GemmaDecoderLayer + transformers.models.gemma.modeling_gemma.GemmaDecoderLayer = te_decoder_cls + try: + yield + finally: + transformers.models.gemma.modeling_gemma.GemmaDecoderLayer = original_gemma_decoder_cls + + +class TEGemmaForCausalLM(GemmaForCausalLM): + """ + Causal LM created with `GemmaModel`. The underlying `GemmaDecoderLayer` + class is monkey-patched with `TEGemmaDecoderLayer` class before + initializing the causal LM with `GemmaForCausalLM`. + + Args: + config: GemmaConfig + """ + + def __init__(self, config: GemmaConfig): + with replace_decoder(te_decoder_cls=TEGemmaDecoderLayer): + super().__init__(config) + self.to(torch.bfloat16).cuda() + self.hidden_size = config.hidden_size + self._model_generation_phase = GemmaGenerator( + lm_head=self.lm_head, + model=self.model, + dtype=torch.bfloat16, + qkv_format=config.qkv_format, + ) + self._model_context_phase = StaticGemmaModel( + self.model, torch.bfloat16, "padding_causal", self.lm_head + ) + + if self.config.fp8: + self.fp8_recipe = DelayedScaling( + fp8_format=Format.HYBRID, amax_history_len=16, amax_compute_algo="max" + ) + + @staticmethod + def _padding_to_end(inputs, lengths): + """ + Gets the tensor with sequence padded from the beginning and + return tensor padded from its end. + + Parameters + ---------- + inputs : Tensor, tensor with shape [b, s] containing token numbers. + It's padded from the beggining. + lengths: Tensor, tensor with shape [s] with lengths of the sequences. + + """ + max_seq_len = torch.max(lengths) + batch_size, max_seq_len = inputs.shape + new_input_ids = inputs.clone() + for i in range(batch_size): + new_input_ids[i, : lengths[i]] = inputs[i, (max_seq_len - lengths[i]) : max_seq_len] + new_input_ids[i, lengths[i] :] = inputs[i, 0 : (max_seq_len - lengths[i])] + inputs.copy_(new_input_ids) + + def _next_64_multiply(self, x): + return ((x + 63) // 64) * 64 + + # This function is overriden in TeGEmmaForCausalLMCudaGraphs. + def _create_hidden_states_buffer(self, input_ids: torch.Tensor): + return torch.empty( + (input_ids.shape[0], input_ids.shape[1], self.hidden_size), + device="cuda", + dtype=torch.float32, + ) + + # This function is overriden in TeGEmmaForCausalLMCudaGraphs. + def _create_inference_params(self, max_batch_size: int, max_sequence_length: int): + return InferenceParams( + max_batch_size, max_sequence_length, qkv_format=self.config.qkv_format + ) + + # This function is overriden in TeGEmmaForCausalLMCudaGraphs. + def _get_max_input_seq_len(self, input_ids): + return input_ids.shape[1] + + # The buffer for generation is some part (beginning) of hidden states buffer. + # This function returns pointer to it and also copies there data if provided. + def _get_generation_buffer(self, hidden_states_buffer, data_to_copy=None): + # hidden_states_buffer has shape [b, s, hd] + # generation_buffer will have shape [b, 1, hd] + # Notice that "generation_buffer = hidden_states_buffer[:, 0, :].unsqueeze(1)" + # will return uncontiguous buffer, which we want to avoid. 
+ output = hidden_states_buffer.view(-1)[ + : hidden_states_buffer.shape[0] * hidden_states_buffer.shape[2] + ] + if data_to_copy is not None: + output.copy_(data_to_copy.reshape(-1)) + generation_buffer = output.view( + (hidden_states_buffer.shape[0], 1, hidden_states_buffer.shape[2]) + ) + return generation_buffer + + def _generate_context_phase(self, input_ids: torch.Tensor, inference_params: InferenceParams): + hidden_states = self._create_hidden_states_buffer(input_ids) + hidden_states.data[:] = self.model.embed_tokens(input_ids) + + # We need to update offsets before every forward pass to make cache work properly. + lengths = input_ids.ne(0).sum(dim=1) + if self.config.qkv_format == "thd": + inference_params.setup_before_new_input( + lengths_tensor=lengths, max_input_length=input_ids.shape[1] + ) + else: + inference_params.setup_before_new_input(length=input_ids.shape[1]) + + hidden_states.data[:] = self.model.embed_tokens(input_ids) + logits = self._model_context_phase( + hidden_states, + attention_mask=((input_ids == 0) if self.config.qkv_format != "thd" else None), + ) + + # We choose logits coresponding with last token in each sequence, + # which have various lengths - they are stored in (inference_params.incoming_seq_len - 1) + # Tensor when qkv_format == "thd" and + # they are the last token in the sequence when qkv_format != "thd". + if self.config.qkv_format == "thd": + logits = logits[ + torch.arange(logits.size(0)), inference_params.input_sequence_lengths - 1, : + ] + else: + logits = logits[:, -1, :] + next_tokens = torch.argmax(logits, dim=1) + + # self.hidden_states have shape [b, s, hd]. + # We return hidden state for the last token - output has shape [b, 1, hd] + hidden_states = self._get_generation_buffer( + hidden_states, self.model.embed_tokens(next_tokens) + ) + return hidden_states, next_tokens + + def _make_mask_one_token_longer(self, mask): + return torch.cat( + [mask, torch.zeros(mask.size(0), 1, 1, 1, dtype=torch.bool, device=mask.device)], dim=-1 + ) + + @torch.no_grad() + def generate( + self, + input_ids: Optional[torch.Tensor] = None, + pad_token_id: int = 0, + max_new_tokens: int = 0, + *args, + **kwargs + ): + self.eval() + + # We need both autocasts: FP8 for operations that can run in lower precision + # and BF16 for those that cannot. + with autocast(dtype=torch.bfloat16, cache_enabled=False), te.pytorch.fp8_autocast( + enabled=self.config.fp8, fp8_recipe=self.fp8_recipe if self.config.fp8 else None + ): + + batch_size, max_input_sequence_len = input_ids.shape[0], self._get_max_input_seq_len( + input_ids + ) + lengths = torch.sum(input_ids.ne(pad_token_id), dim=-1).squeeze() # [s] + input_ids = F.pad( + input_ids, (max_input_sequence_len - input_ids.shape[1], 0), "constant", 0 + ) + + # InferenceParams is a cache, where keys and values of previous tokens are stored. + # Moreover it stores length of both already generated and input sequences. + inference_params = self._create_inference_params( + max_batch_size=batch_size, + max_sequence_length=self._next_64_multiply(max_input_sequence_len + max_new_tokens), + ) + + self._model_context_phase.set_inference_params(inference_params) + self._model_generation_phase.set_inference_params(inference_params) + + if self.config.qkv_format == "thd": + # For thd layout padding is at the end, otherwise at the beginning. + TEGemmaForCausalLM._padding_to_end(input_ids, lengths) + + hidden_states, next_tokens = self._generate_context_phase(input_ids, inference_params) + + # Generation phase. 
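+ # From here on, each sequence grows by exactly one token per iteration, so the cache offsets advance by one.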
+ if self.config.qkv_format == "thd": + inference_params.setup_before_new_input( + lengths_tensor=torch.ones((next_tokens.shape[0],), device="cuda"), + max_input_length=1, + ) + else: + inference_params.setup_before_new_input(length=1) + + output_tokens = [next_tokens] + + mask = None + if self.config.qkv_format != "thd": + mask = (input_ids == 0).unsqueeze(1).unsqueeze(1) + + for _ in range(max_new_tokens): + if self.config.qkv_format != "thd": + # It will not work with cuda graphs, but it is not used for thd qkv_format. + mask = self._make_mask_one_token_longer(mask) + + next_tokens = self._model_generation_phase(hidden_states, mask) + # next_tokens is static output tensor, so we need to clone it + # - it gets changed every iteration. + output_tokens.append(next_tokens.clone()) + + result = torch.cat((input_ids, torch.stack(output_tokens).permute([1, 0])), dim=1) + return result + + +class TEGemmaForCausalLMCudaGraphs(TEGemmaForCausalLM): + """ + TEGemmaForCausalLMCudaGraphs is the version of the class TEGemmaForCausalLM + using CUDA Graphs to speed it up. We need to make one trade-off. + Namely, batch_size, max_seq_len and max_context_seq_len need to be static. + It is necessary to run generation with the same value of + these variables that we recorded graph on. + """ + + def __init__(self, config: GemmaConfig): + super().__init__(config) + assert ( + config.qkv_format == "thd" + ), "Generation with CUDA Graphs are implemented only for thd format." + + # Preparation of the static buffers. + self.config = config + self.hidden_states_buffer = torch.empty( + ( + config.cuda_graphs_static_batch_size, + config.cuda_graphs_static_max_context_len, + config.hidden_size, + ) + ).cuda() + # This is in fact part of the buffer for hidden_states. + self.generation_buffer = self._get_generation_buffer(self.hidden_states_buffer) + self.inference_params = InferenceParams( + max_batch_size=config.cuda_graphs_static_batch_size, + max_sequence_length=config.cuda_graphs_static_max_seq_len, + qkv_format="thd", + ) + + self._model_generation_phase.set_inference_params(self.inference_params) + self._model_context_phase.set_inference_params(self.inference_params) + + def record(self): + # We want to record model in training=False, because it will be used in generation. + self.eval() + + # Here "the trick" happens. We override methods from TEGemmaForCausalLM + # with their recorded version. After invocation of each of them, + # captured graph will be replayed with minimal usage of CPU, + # what will lead to huge speedup. 
+ input_shape = ( + self.config.cuda_graphs_static_batch_size, + self.config.cuda_graphs_static_max_context_len, + ) + self.inference_params.reset() + self.inference_params.setup_before_new_input( + lengths_tensor=torch.tensor(input_shape[0] * [input_shape[1]], device="cuda"), + max_input_length=input_shape[1], + ) + self._model_context_phase = self.record_graph( + self._model_context_phase, self.hidden_states_buffer + ) # CUDA Graphs recording + + input_shape = (self.config.cuda_graphs_static_batch_size, 1) + self.inference_params.reset() + self.inference_params.setup_before_new_input( + lengths_tensor=torch.tensor(input_shape[0] * [input_shape[1]], device="cuda"), + max_input_length=input_shape[1], + ) + self._model_generation_phase = self.record_graph( + self._model_generation_phase, self.generation_buffer + ) # CUDA Graphs recording + + """ + Functions _create_hidden_states_buffer and _create_inference_params + from base class are overriden to make hidden_states and inference_params static + - not changing their position in memory between every invocation. + """ + + def _create_hidden_states_buffer(self, *args, **kwargs): + return self.hidden_states_buffer + + def _create_inference_params(self, *args, **kwargs): + self.inference_params.reset() + return self.inference_params + + def _get_max_input_seq_len(self, _): + return self.config.cuda_graphs_static_max_context_len + + @torch.no_grad() + def record_graph(self, function, input_tensor): + # function is invoked on argument (self.hidden_states,) and all kernels are recorded. + # record_graph() returns captured function, which can be run later with lower of th CPU. + fp8_format = Format.HYBRID + fp8_recipe = DelayedScaling( + fp8_format=fp8_format, amax_history_len=1024, amax_compute_algo="max" + ) + + # We need both autocasts: FP8 for operations that can run in lower precision + # and BF16 for those that cannot. + with autocast(dtype=torch.bfloat16, cache_enabled=False): + graphed_function = te.pytorch.make_graphed_callables( + function, + (input_tensor,), + fp8_enabled=self.config.fp8, + fp8_recipe=fp8_recipe, + allow_unused_input=True, + num_warmup_iters=3, + ) + return graphed_function diff --git a/docs/examples/te_gemma/te_gemma_loading_weights.py b/docs/examples/te_gemma/te_gemma_loading_weights.py new file mode 100644 index 0000000000..87e6667a9b --- /dev/null +++ b/docs/examples/te_gemma/te_gemma_loading_weights.py @@ -0,0 +1,159 @@ +# Copyright (c) 2022-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# See LICENSE for license information. + +import os +import re +import gc +import torch + +from typing import List + +from transformer_engine.pytorch.fp8 import fp8_model_init + +from transformers.modeling_utils import load_state_dict, _load_state_dict_into_model +from transformers.utils.hub import get_checkpoint_shard_files + +""" + This file contains logic of mapping the HuggingFace GemmaModel parameters + with TransformerEngine TransformerLayer. When we have initialized Transformer models + both with HF and with TE, we can copy parameters from the first to the second. +""" + + +def _load_weights_for_fp8_model(vanilla_model, hyperparams): + # The weights are loaded from the file with state_dict + # of model with weights which contains also fp8 parameters. + # The weights are in BF16 precision, but they contain fp8 metadata + # computed by the calibration procedure. 
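+ # Loading this state dict restores both the BF16 weights and the calibrated FP8 scaling factors.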
+ vanilla_model.load_state_dict( + torch.load(hyperparams.fp8_model_weights_filename), + strict=False, + # strict = false, because some parameters have + # multiple pointers to the same weight + # vanilla_model._model_context_phase.model + # and vanilla_model._model_generation_phase.model + ) + + +def _load_weights_for_standard_model(vanilla_model, config): + # The weights are loaded from the file with original weights. + archive_file = os.path.join(config.model_name, "model.safetensors.index.json") + resolved_archive_file, _ = get_checkpoint_shard_files(config.model_name, archive_file) + total_dict = {} + for shard_file in resolved_archive_file: + state_dict = load_state_dict(shard_file) + total_dict.update(state_dict) + + replace_params( + total_dict, + vanilla_model.state_dict(), + config, + qkv_fused_and_interleaved=config.fuse_qkv_params, + ) + # Copy parameters like embedding: + _load_state_dict_into_model(vanilla_model, total_dict, start_prefix="") + + # Force mem release. Taken from huggingface code. + del total_dict + gc.collect() + + +def load_te_model(cls, config): + """ + Custom method adapted from `from_pretrained` method in HuggingFace + Transformers repo: + https://github.com/huggingface/transformers/blob/f497f564bb76697edab09184a252fc1b1a326d1e/src/transformers/modeling_utils.py#L2579 + """ + config.use_cache = False # To make TransformerLayer compatible with GemmaModel + with fp8_model_init(config.fp8_model_init): + # there we need only to create model + vanilla_model = cls(config).to(torch.bfloat16).cuda() + + # and now we copy the weights into it + if config.fp8_model_weights_filename is not None: + _load_weights_for_fp8_model(vanilla_model, config) + else: + _load_weights_for_standard_model(vanilla_model, config) + + return vanilla_model + + +def _get_all_layer_prefixes_to_update(hf_state_dict): + """ + There are many parameters in hf_state_dict, whose name start with "model.layers.[number]." + This function extracts all strings like "model.layers.[number]." + that are starting strings of keys in hf_state_dict. + """ + all_layer_prefixes = set() + for param_key in hf_state_dict.keys(): + layer_prefix_pat = "model.layers.\d+." + m = re.match(layer_prefix_pat, param_key) + if m is not None: + all_layer_prefixes.add(m.group()) + return all_layer_prefixes + + +def replace_params(hf_state_dict, te_state_dict, config, qkv_fused_and_interleaved=False): + """ + Replaces params from TE TransformerLayer state_dict with corresponding parameters + from HuggingFace GemmaModel state_dict. 
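+ Returns the set of "model.layers.<number>." prefixes that were updated.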
+ """ + all_layer_prefixes: List[str] = _get_all_layer_prefixes_to_update(hf_state_dict) + + for layer_prefix in all_layer_prefixes: + + def copy_from_ht_to_te(te_name, hf_name, start=None, end=None): + te_state_dict[layer_prefix + te_name].data[start:end].copy_( + hf_state_dict[layer_prefix + hf_name] + ) + + copy_from_ht_to_te( + "self_attention.layernorm_qkv.layer_norm_weight", "input_layernorm.weight" + ) + copy_from_ht_to_te("self_attention.proj.weight", "self_attn.o_proj.weight") + copy_from_ht_to_te("layernorm_mlp.layer_norm_weight", "post_attention_layernorm.weight") + copy_from_ht_to_te("layernorm_mlp.fc2_weight", "mlp.down_proj.weight") + copy_from_ht_to_te( + "layernorm_mlp.fc1_weight", "mlp.gate_proj.weight", end=config.intermediate_size + ) + copy_from_ht_to_te( + "layernorm_mlp.fc1_weight", "mlp.up_proj.weight", start=config.intermediate_size + ) + + if qkv_fused_and_interleaved: + """ + When qkv_fused_and_interleaved=True, key, query and value layers are on one tensor + in TE TransformerLayer. Moreover they are interleaved within each head. + Let q_i, k_i and v_i be query, key and value layers for i-th head respectively. + Then TE stores weight tensor in the form: + [q1 k1 v1 q2 k2 v2 ...] + This is done to maximally optimize performance time. + """ + te_qkv_layer = te_state_dict[layer_prefix + "self_attention.layernorm_qkv.weight"] + + def copy_interleave(hf_name, idx): + src = hf_state_dict[layer_prefix + hf_name] + for head_nr in range(config.num_attention_heads): + dst_offset = head_nr * config.head_dim * 3 + dst_slice = slice( + dst_offset + idx * config.head_dim, dst_offset + (idx + 1) * config.head_dim + ) + src_slice = slice( + head_nr * config.head_dim, head_nr * config.head_dim + config.head_dim + ) + te_qkv_layer[dst_slice, :] = src[src_slice, :] + + copy_interleave("self_attn.q_proj.weight", 0) + copy_interleave("self_attn.k_proj.weight", 1) + copy_interleave("self_attn.v_proj.weight", 2) + else: + copy_from_ht_to_te( + "self_attention.layernorm_qkv.query_weight", "self_attn.q_proj.weight" + ) + copy_from_ht_to_te("self_attention.layernorm_qkv.key_weight", "self_attn.k_proj.weight") + copy_from_ht_to_te( + "self_attention.layernorm_qkv.value_weight", "self_attn.v_proj.weight" + ) + + return all_layer_prefixes diff --git a/docs/examples/te_gemma/tutorial_accelerate_hf_gemma_finetuning_with_te.ipynb b/docs/examples/te_gemma/tutorial_accelerate_hf_gemma_finetuning_with_te.ipynb new file mode 100644 index 0000000000..7875ffc9f3 --- /dev/null +++ b/docs/examples/te_gemma/tutorial_accelerate_hf_gemma_finetuning_with_te.ipynb @@ -0,0 +1,314 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Accelerating a Hugging Face Gemma model finetuning with Transformer Engine" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In the previous [tutorial](../te_llama/tutorial_accelerate_hf_llama_finetuning_with_te.ipynb), we demonstrated how to accelerate HF Llama models using the Transformer Engine library. We replaced `LlamaDecoderLayer` with `TransformerLayer` from the Transformer Engine, achieving a speedup. Furthermore, we conducted the finetuning in FP8 precision, which yielded an additional speedup.\n", + "\n", + "Now, we will undertake a similar enhancement for the Google's [Gemma](https://blog.google/technology/developers/gemma-open-models/) model." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Dependencies for this tutorial\n", + "\n", + "Following files and media are necessary to effectively run this tutorial:\n", + "\n", + "1. `te_gemma.py`\n", + " - This file contains the code to load a Hugging Face Gemma checkpoint in Transformer Engine's `TransformerLayer` instead of Hugging Face's `GemmaDecoderLayer`. This is used in the following two sections of the tutorial - \"Improvement 1\" and \"Improvement 2\".\n", + "2. `utils.py`\n", + " - This file contains the code related to dataloading, hyperparameters, setting up model/optimizers/accelerator, model training and other miscellaneous tasks like restarting the jupyter notebook from within the cell. \n", + "3. `requirements.txt`\n", + " - This file contains necessary Python packages for this tutorial.\n", + "4. `media/`\n", + " - This directory contains the images used in the following tutorial." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%pip install -r requirements.txt\n", + "\n", + "import torch\n", + "cudnn_version = torch.backends.cudnn.version()\n", + "assert cudnn_version >= 90100, \"cuDNN version >= 9.1.0 is needed to run this tutorial.\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Differences between Llama and Gemma" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Thr Llama and the Gemma are very similar models - both are based on Transformer Decoder architecture. The most important architectural differences between them are the following:\n", + "\n", + "\n", + "| Feature | Llama | Gemma |\n", + "|----------------------------------------------|------------------------------------|--------------------------------------------|\n", + "| **Norm Layer** | Standard RMSNorm
$y = \\frac{x - \\mathrm{E}[x]}{ \\sqrt{\\mathrm{Var}[x] + \\varepsilon}} * \\gamma + \\beta$ | RMSNorm with zero centered gamma parameter
$y = \\frac{x - \\mathrm{E}[x]}{ \\sqrt{\\mathrm{Var}[x] + \\varepsilon}} * (\\textcolor{red}{1 +} \\gamma) + \\beta$ |\n", + "| **Embedding Dimension/Head Dimension** | 4096/4096 | 3072/4096 |\n", + "| **Activation Function** | SwiGlu | GeGlu |\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## [Baseline] Running HF `GemmaModel` (Precision: `BF16`)\n", + "\n", + "Similarly to the Llama tutorial, we begin the experiments by running baseline Hugging Face Gemma model finetuning in BF16 precision.\n", + "\n", + "
\n", + "\n", + "Note\n", + " \n", + "This tutorial loads and trains a Gemma 7B model which takes up most of the GPU memory and therefore, we need to restart the jupyter notebook each time before running the following sections. A small utility method `restart_jupyter_notebook` is defined in the accompanying `utils.py` file. This function restarts the jupyter notebook so that the GPU memory is flushed before the model is loaded again from the checkpoint in order to avoid running into OOM (Out Of Memory) errors.\n", + "\n", + "If the utility doesn't work, comment this line `restart_jupyter_notebook()` in the following cell and manually restart the jupyter notebook before running the cell. Repeat the same for other sections in this tutorial.\n", + "\n", + "
\n" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "10 finetuning steps complete!\n", + "\n", + "Average time taken per step: \n", + "298 \n", + "milliseconds\n" + ] + } + ], + "source": [ + "# Restart the notebook (to flush the GPU memory)\n", + "from utils import restart_jupyter_notebook\n", + "restart_jupyter_notebook()\n", + "\n", + "\n", + "# Import necessary packages and methods\n", + "from utils import *\n", + "\n", + "\n", + "# Default hyperparams, also defined in `utils.py` in class `Hyperparameters`\n", + "## !!! `model_name` attr must point to the location of the model weights !!!\n", + "## Weights can be downloaded from: https://huggingface.co/google/gemma-7b\n", + "hyperparams.model_name = \"../../../../gemma-7b\" # <== Add model weight location here e.g. \"/path/to/downloaded/gemma/weights\"\n", + "hyperparams.mixed_precision = \"bf16\"\n", + "\n", + "\n", + "# Init the model and accelerator wrapper\n", + "model = init_baseline_model(hyperparams).cuda()\n", + "accelerator, model, optimizer, train_dataloader, lr_scheduler = wrap_with_accelerator(model, hyperparams)\n", + "\n", + "\n", + "# Finetune the model\n", + "finetune_model(model, hyperparams, accelerator, train_dataloader, optimizer, lr_scheduler)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's add this information in a table and keep comparing it with a few possible improvements in future sections:\n", + "\n", + "| Models | Precision | Step Time (or ms per batch) | Speedup (over baseline) |\n", + "|-------------------------------------------------------------|-----------|-----------------------------|-------------------------|\n", + "| HF (baseline) | BF16 | 298 | 1 |" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## [Improvement 1] Replace HF's `GemmaDecoderLayer` with TE's `TransformerLayer` (Precision: `BF16`)\n", + "\n", + "We replace *GemmaDecoderLayer* with the highly tuned *TransformerLayer*, similarly to our approach in the [Llama tutorial](../te_llama/tutorial_accelerate_hf_llama_finetuning_with_te.ipynb). Let's observe the impact this change has on the model's speed." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "10 finetuning steps complete!\n", + "\n", + "Average time taken per step: \n", + "257 \n", + "milliseconds\n" + ] + } + ], + "source": [ + "# Restart the notebook (to flush the GPU memory)\n", + "from utils import restart_jupyter_notebook\n", + "restart_jupyter_notebook()\n", + "\n", + "\n", + "# Import necessary packages and methods\n", + "from utils import *\n", + "\n", + "\n", + "# Default hyperparams, also defined in `utils.py` in class `Hyperparameters`\n", + "## !!! `model_name` attr must point to the location of the model weights !!!\n", + "## Weights can be downloaded from: https://huggingface.co/google/gemma-7b\n", + "hyperparams.model_name = \"../../../../gemma-7b\" # <== Add model weight location here e.g. 
\"/path/to/downloaded/gemma/weights\"\n", + "hyperparams.mixed_precision = \"bf16\"\n", + "\n", + "\n", + "# Init the model and accelerator wrapper\n", + "model = init_te_gemma_model(hyperparams).cuda()\n", + "accelerator, model, optimizer, train_dataloader, lr_scheduler = wrap_with_accelerator(model, hyperparams)\n", + "\n", + "\n", + "# Finetune the model\n", + "finetune_model(model, hyperparams, accelerator, train_dataloader, optimizer, lr_scheduler)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Compared to the \"baseline\" implementation, we see that using Transformer Engine's `TransformerLayer` in place of Huggging Face's `GemmaDecoderLayer` gives a speedup of **16%** even when using only BF16 precision!\n", + "\n", + "| Models | Precision | Step Time (or ms per batch) | Speedup (over baseline) |\n", + "|-------------------------------------------------------------|-----------|-----------------------------|-------------------------|\n", + "| HF (baseline) | BF16 | 298 | 1 |\n", + "| TE (replace `GemmaDecoderLayer` with `TE.TransformerLayer`) | BF16 | 257 | 1.16 |" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## [Improvement 2] Replace HF's `GemmaDecoderLayer` with TE's `TransformerLayer` (Precision: `FP8`)\n", + "\n", + "The last improvement is about enabling FP8 precision. Let's see how it works." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "10 finetuning steps complete!\n", + "\n", + "Average time taken per step: \n", + "214 \n", + "milliseconds\n" + ] + } + ], + "source": [ + "# Restart the notebook (to flush the GPU memory)\n", + "from utils import restart_jupyter_notebook\n", + "#restart_jupyter_notebook()\n", + "\n", + "\n", + "# Import necessary packages and methods\n", + "from utils import *\n", + "\n", + "\n", + "# Default hyperparams, also defined in `utils.py` in class `Hyperparameters`\n", + "## !!! `model_name` attr must point to the location of the model weights !!!\n", + "## Weights can be downloaded from: https://huggingface.co/google/gemma-7b\n", + "hyperparams.model_name = \"../../../../gemma-7b\" # <== Add model weight location here e.g. \"/path/to/downloaded/gemma/weights\"\n", + "hyperparams.mixed_precision = \"fp8\"\n", + "\n", + "\n", + "# Init the model and accelerator wrapper\n", + "model = init_te_gemma_model(hyperparams).cuda()\n", + "accelerator, model, optimizer, train_dataloader, lr_scheduler = wrap_with_accelerator(model, hyperparams)\n", + "\n", + "\n", + "# Finetune the model\n", + "finetune_model(model, hyperparams, accelerator, train_dataloader, optimizer, lr_scheduler)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "| Models | Precision | Step Time (or ms per batch) | Speedup (over baseline) |\n", + "|-------------------------------------------------------------|-----------|-----------------------------|-------------------------|\n", + "| HF (baseline) | BF16 | 298 | 1 |\n", + "| TE (replace `GemmaDecoderLayer` with `TE.TransformerLayer`) | BF16 | 257 | 1.16 |\n", + "| TE (replace `GemmaDecoderLayer` with `TE.TransformerLayer`) | FP8 | 214 | 1.39 |\n", + "\n", + "\n", + "After turning on FP8 precision, we get even more speedup of almost **39%**!" 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Conclusion\n", + "\n", + "As shown in the [Llama tutorial](../te_llama/tutorial_accelerate_hf_llama_finetuning_with_te.ipynb), using the `TransformerLayer` module from Transformer Engine to replace Hugging Face's `GemmaDecoderLayer` results in a speedup compared to Hugging Face's native Gemma implementation." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## See more\n", + "\n", + "We also prepared [tutorial](./tutorial_generation_gemma_with_te.ipynb) in which we will show how to speedup the Gemma model generation using Transformer Engine." + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.12" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/docs/examples/te_gemma/tutorial_generation_gemma_with_te.ipynb b/docs/examples/te_gemma/tutorial_generation_gemma_with_te.ipynb new file mode 100644 index 0000000000..1948a1481b --- /dev/null +++ b/docs/examples/te_gemma/tutorial_generation_gemma_with_te.ipynb @@ -0,0 +1,874 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "40364db7", + "metadata": {}, + "source": [ + "# Accelerating token generation of the Hugging Face Gemma Model with Transformer Engine\n", + "\n", + "Generative AI has made remarkable strides in recent years, with Large Language Models (LLMs) like ChatGPT at the forefront. These models have revolutionized how we interact with machine-generated content, providing capabilities that range from writing assistance to complex decision support. The core functionality of these models is the generation process, which involves predicting the next token in a sequence based on the preceding text. This task is critical for applications such as automated content creation, translation, and more, emphasizing the importance of efficient implementation.\n", + "\n", + "\n", + "\n", + "
\n", + "\"\"\n", + "
\n", + "Animation 1: Hugging Face Gemma model token generation.\n", + "
\n", + "
\n", + "\n", + "For those seeking a deeper understanding of text generation mechanisms in Transformers, it is recommended to check out the [HuggingFace generation tutorial](https://huggingface.co/docs/transformers/llm_tutorial).\n", + "\n", + "In the previous tutorials on [Llama](../te_llama/tutorial_accelerate_hf_llama_finetuning_with_te.ipynb) and [Gemma](./tutorial_accelerate_hf_gemma_finetuning_with_te.ipynb), it was demonstrated how finetuning can be accelerated using the Transformer Engine's `TransformerLayer`. Building on this foundation, the current objective is to enhance the generation speed of the Gemma model.\n", + "\n", + "This tutorial will introduce and explain several advanced features of the Transformer Engine that contribute to this goal:\n", + "\n", + "###### **1. THD Attention Layout.**\n", + "\n", + "Addressing the challenge of computing attention for sequences with varying lengths, a common method is to pad these sequences and apply an attention mask. The Transformer Engine, however, offers a more optimized approach—by specifying the lengths and offsets of the sequences, attention can be computed directly. Instead of passing the tensor with shape `[b, s, h, d]` and the attention mask, one can pass a tensor of the shape `[t, h, d]` along with tensors detailing cumulative sequence lengths and offsets to run the attention optimized for this case. This specific attention layout is referred to as the **THD layout**. \n", + "\n", + "\n", + "The letter `t` in the standard `[t, h, d]` layout is equal to the total length of the sequences, namely `t = s_1 + s_2 + ... + s_b`, where `s_i` denotes the length of sequence `i`. TransformerEngine supports a THD layout that incorporates gaps between these sequences - the lengths of the offsets need to be passed in the additional parameter.\n", + "\n", + "
\n", + "\"\"\n", + "
\n", + "Figure 1: The difference between BSHD (default) and THD attention layouts is as follows: with BSHD, one needs to provide the attention mask, while with THD, one needs to provide cumulative sequence lengths and sequence offsets.\n", + "
\n", + "
\n", + "\n", + "###### **2. CUDA Graphs API.**\n", + "\n", + "The speed of GPUs is increasing at a rapid pace. It turns out that sometimes the runtime of kernels is shorter than the time it takes for the CPU to submit them, which can lead to significant overhead. CUDA Graphs can address this issue. When certain kernels are executed repeatedly, it allows us to record and replay them with less CPU involvement. This becomes particularly useful in applications like token generation, where a `TransformerLayer` is run for every token that needs to be generated.\n", + "\n", + "One can read more about CUDA Graphs [here](https://developer.nvidia.com/blog/cuda-graphs/).\n", + "\n", + "PyTorch exposes graphs via a raw `torch.cuda.CUDAGraph` class and two convenience wrappers: `torch.cuda.graph` and `torch.cuda.make_graphed_callables`. More information about the cuda graphs in Pytorch can be found [here](https://pytorch.org/blog/accelerating-pytorch-with-cuda-graphs/).\n", + "\n", + "
\n", + "\"\"\n", + "
\n", + "Figure 2: CUDA Graphs reduce the overhead generated by the long time it takes to launch a single kernel. It enables the recording and replaying of subsequent launches, thus reducing the total time used by the CPU.\n", + "
\n", + "
\n", + "\n", + "\n", + "###### **3. FP8 Weights Calibration.**\n", + "\n", + "Assuming that the model is trained in FP32/BF16 precision and the goal is to execute it in FP8 precision, the process isn't straightforward due to the absence of appropriate FP8 scaling factors. In this scenario, FP8 calibration becomes essential. By conducting several forward passes on sample data, the FP8 scaling parameters can be computed. This calibration allows the model to operate correctly in FP8 precision.\n", + "\n", + "It is highly recommended to familiarize oneself with the [tutorial](../../examples/fp8_primer.ipynb) on FP8 precision to understand the importance of proper scaling factors.\n", + "\n", + "\n", + "
\n", + "\"\"\n", + "
\n", + "Figure 3:\n", + "If the model is trained in BF16/FP32, it does not include the computed FP8 scaling factors. When it is run under fp8_autocast(), the value of these scaling factors will default to their initial values, which can cause numerical errors. Weight calibration involves calculating FP8 scaling factors from higher precision forward passes. Once these factors are computed, the model becomes numerically stable. \n", + "
\n", + "
\n", + "\n", + "###### **4. FP8 Model Weights.**\n", + "\n", + "The typical approach is to store weights in higher precision and then cast them to fp8 before operations. This may prevent accuraccy drops in training. However, for inference, this level of precision is not necessary.\n", + "\n", + "The TransformerEngine includes a wrapper `fp8_model_​init`, which allows for the creation of models that store only the FP8 copy of the weights. This eliminates the need to cast from higher precision to BF16, saving time in this casting process. \n", + "\n", + "
\n", + "\"\"\n", + "
\n", + "Figure 4: Model under fp8_autocast() stores weights in high precision by default, and casts them if needed. It can leads to slowdown and increased memory usage. Using fp8_model_init() results in storing weight in FP8.\n", + "
\n", + "
\n", + "\n", + "###### Benchmarking\n", + "\n", + "We'll evaluate the generation time across one benchmark: generation with context phase max sequence length = 128, batch size = 64 and number of generated tokens = 896 on random texts with random lengths.\n", + "\n", + "
\n", + "Note\n", + " \n", + "This tutorial focuses on showcasing the mentioned features of Transformer Engine in the context of token generation. It's important to note, however, that NVIDIA provides [TensorRT](https://developer.nvidia.com/tensorrt), which is optimized for inference tasks and should be considered for such use cases.\n", + "
" + ] + }, + { + "cell_type": "markdown", + "id": "b18f91a9", + "metadata": {}, + "source": [ + "## Dependencies for this tutorial" + ] + }, + { + "cell_type": "markdown", + "id": "e5201d77", + "metadata": {}, + "source": [ + "Following files and media are necessary to effectively run this tutorial:\n", + "\n", + "1. `te_gemma.py`\n", + " - This file contains the code to load a Hugging Face Gemma checkpoint in Transformer Engine's `TransformerLayer` instead of Hugging Face's `GemmaDecoderLayer`. It does also contain code for generation with THD attention, CUDA Graphs and weight calibration.\n", + "2. `te_gemma_loading_weights.py`\n", + " - This file contains logic of mapping the parameters from `GemmaDecoderLayer` into the `TransformerLayer`.\n", + "3. `utils.py`\n", + " - This file contains the code related to dataloading, hyperparameters, setting up model/optimizers/accelerator, model training and other miscellaneous tasks like restarting the jupyter notebook from within the cell. \n", + "4. `requirements.txt`\n", + " - This file contains necessary Python packages for this tutorial.\n", + "5. `media/`\n", + " - This directory contains the images used in the following tutorial." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "31390c76", + "metadata": {}, + "outputs": [], + "source": [ + "%pip install -r requirements.txt\n", + "\n", + "import torch\n", + "cudnn_version = torch.backends.cudnn.version()\n", + "assert cudnn_version >= 90100, \"cuDNN version >= 9.1.0 is needed to run this tutorial.\"" + ] + }, + { + "cell_type": "markdown", + "id": "e8dfabbf", + "metadata": {}, + "source": [ + "\n", + "|\n", + "## [Baseline] Running Hugging Face generation with Gemma model" + ] + }, + { + "cell_type": "markdown", + "id": "59560bff", + "metadata": {}, + "source": [ + "HuggingFace Transformers library offers generation API. \n", + "HuggingFace generation for the Gemma model will be used as a baseline." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "2803e0ec", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "============================== Generation example 1 ==============================\n", + "Prompt:\n", + "Here are the two facts about GPUs:\n", + "\n", + "Generated text:\n", + "\n", + "1. They are very good at doing the same thing over and over again.\n", + "2. They are very bad at doing different things at the same time.\n", + "\n", + "The first fact is why GPUs are so good at rendering video games. The second\n", + "============================== Generation example 2 ==============================\n", + "Prompt:\n", + "Some facts about NVIDIA:\n", + "\n", + "Generated text:\n", + "\n", + "* NVIDIA is a global technology leader in the design and manufacture of \n", + " advanced microprocessors for the PC and mobile computing markets.\n", + "* NVIDIA is a leading provider of graphics processing units (GPUs) for the PC and mobile computing markets.\n", + "*\n", + "============================== Benchmarking ==============================\n", + "Benchmarking for batch_size = 64 and max total tokens = 1024\n", + "Time: 87.68 s.\n" + ] + } + ], + "source": [ + "# Restart the notebook (to flush the GPU memory)\n", + "from utils import restart_jupyter_notebook\n", + "restart_jupyter_notebook()\n", + "\n", + "from utils import *\n", + "\n", + "# Default hyperparams, also defined in `utils.py` in class `Hyperparameters`\n", + "# !!! 
`model_name` attr must point to the location of the model weights !!!\n", + "# Weights can be downloaded from: https://huggingface.co/google/gemma-7b.\n", + "# Weights should be in the *.safetensors HF format, not in the original format.\n", + "hyperparams.model_name = \"\" # <== Add model weight location here e.g. \"/path/to/downloaded/gemma/weights\"\n", + "\n", + "model = init_baseline_model(hyperparams)\n", + "\n", + "print_sample_of_generated_texts(model)\n", + "benchmark_generation(model)" + ] + }, + { + "cell_type": "markdown", + "id": "b3698dc6", + "metadata": {}, + "source": [ + "Let's put this time into the table for later comparison.\n", + "\n", + "| Models | Time (s) | Speedup | \n", + "|-------------------------------------------------------------|---------------------------------------|--------------------------------------|\n", + "| HF (baseline) | 87.68 | 1 |" + ] + }, + { + "cell_type": "markdown", + "id": "8bb40f45", + "metadata": {}, + "source": [ + "## [Improvement 1] Using TransformerLayer from Transformer Engine instead of GemmaDecoderLayer." + ] + }, + { + "cell_type": "markdown", + "id": "263b40f2", + "metadata": {}, + "source": [ + "As in the [Gemma](./tutorial_accelerate_hf_gemma_finetuning_with_te.ipynb) finetuning tutorial, a GemmaDecoderLayer is substituted by a tuned TransformerLayer from the Transformer Engine. Let's run it and compare the time with the baseline." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "9dceef93", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "============================== Generation example 1 ==============================\n", + "Prompt:\n", + "Here are the two facts about GPUs:\n", + "\n", + "Generated text:\n", + "\n", + "1. GPUs are very good at doing the same thing over and over again.\n", + "2. GPUs are very bad at doing different things at the same time.\n", + "\n", + "The first fact is why GPUs are so good at graphics. The second fact is why\n", + "============================== Generation example 2 ==============================\n", + "Prompt:\n", + "Some facts about NVIDIA:\n", + "\n", + "Generated text:\n", + "\n", + "* NVIDIA is a global technology company that designs and develops high-performance computer\n", + "* graphics and video processing chips.\n", + "* The company was founded in 1993 by Jen-Hsun Huang, Chris Malachowsky, and Curtis Priem.\n", + "============================== Benchmarking ==============================\n", + "Benchmarking for batch_size = 64 and max total tokens = 1024\n", + "Time: 54.11 s.\n" + ] + } + ], + "source": [ + "# Restart the notebook (to flush the GPU memory)\n", + "from utils import restart_jupyter_notebook\n", + "restart_jupyter_notebook()\n", + "\n", + "from utils import *\n", + "\n", + "hyperparams.model_name = \"\" # <== Add model weight location here e.g. \"/path/to/downloaded/gemma/weights\"\n", + "\n", + "model = init_te_gemma_model(hyperparams)\n", + "\n", + "print_sample_of_generated_texts(model)\n", + "benchmark_generation(model)" + ] + }, + { + "cell_type": "markdown", + "id": "b5d40836", + "metadata": {}, + "source": [ + "The speedup of **62%** was obtained." 
+ ] + }, + { + "cell_type": "markdown", + "id": "006d18e8", + "metadata": {}, + "source": [ + "| Models | Time (s) | Speedup | \n", + "|-------------------------------------------------------------|---------------------------------------|--------------------------------------|\n", + "| HF (baseline) | 87.68 | 1 |\n", + "| TE (subsitution of GemmaDecoderLayer with te.TransformerLayer) | 54.11 | 1.62 | " + ] + }, + { + "cell_type": "markdown", + "id": "2bbf3d47", + "metadata": {}, + "source": [ + "## [Improvement 2] Use of THD attention layout.\n", + "\n", + "Input sequences can have various lengths. Hugging Face generation – as can be seen in Animation 1 – pads the sequences and then uses attention mask. In the THD attention layout cumulative sequence lengths and offsets need to be provided, instead of attention mask. The THD attention layout is much more optimized than BSHD layout.\n", + "\n", + "The class `transformer_engine.pytorch.DotProductAttention` supports this format. One need to pass the following things as the arguments to the forward:\n", + "- `seq_offsets_q`, `seq_offsets_k`, `seq_offsets_v` – offsets of the beginnings of the next sequences,\n", + "- `cu_seqlens_q`, `cu_seqlens_kv` – cumulative sum of the lengths of the sequences of query and values,\n", + "- `max_seqlen_q` – maximum sequence length in query layer,\n", + "- `max_seqlen_kv` – maximum sequence length in key-value layer.\n", + "\n", + "
\n", + "Note\n", + "\n", + "Currently, the THD attention for `TransformerLayer` is supported only for token generation.\n", + "
\n", + "\n", + "Let's look how using TransformerEngine with THD attention impacts the speed of token generation:" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "4fc5e1cd", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "============================== Generation example 1 ==============================\n", + "Prompt:\n", + "Here are the two facts about GPUs:\n", + "\n", + "Generated text:\n", + "\n", + "1. They are very good at doing the same thing over and over again.\n", + "2. They are very bad at doing different things at the same time.\n", + "\n", + "The first fact is why GPUs are so good at rendering video games. The second fact\n", + "============================== Generation example 2 ==============================\n", + "Prompt:\n", + "Some facts about NVIDIA:\n", + "\n", + "Generated text:\n", + "\n", + "* NVIDIA is a global technology company that designs and develops high-performance computing \n", + " and graphics processing units (GPUs) for the gaming, professional visualization, and data center markets.\n", + "* The company was founded in 1993 and is headquartered\n", + "============================== Benchmarking ==============================\n", + "Benchmarking for batch_size = 64 and max total tokens = 1024\n", + "Time: 28.22 s.\n" + ] + } + ], + "source": [ + "# Restart the notebook (to flush the GPU memory)\n", + "from utils import restart_jupyter_notebook\n", + "restart_jupyter_notebook()\n", + "\n", + "from utils import *\n", + "\n", + "hyperparams.model_name = \"\" # <== Add model weight location here e.g. \"/path/to/downloaded/gemma/weights\"\n", + "hyperparams.qkv_format = \"thd\"\n", + "\n", + "model = init_te_gemma_model(hyperparams)\n", + "\n", + "print_sample_of_generated_texts(model)\n", + "benchmark_generation(model)" + ] + }, + { + "cell_type": "markdown", + "id": "8e397a65", + "metadata": {}, + "source": [ + "By using THD attention, the following speedup was obtained:\n", + "\n", + "| Models | Time (s) | Speedup | \n", + "|-------------------------------------------------------------|---------------------------------------|--------------------------------------|\n", + "| HF (baseline) | 87.68 | 1 |\n", + "| TE (subsitution of GemmaDecoderLayer with te.TransformerLayer) | 54.11 | 1.62 | \n", + "| TE + THD attention | 28.22 | 3.11 | " + ] + }, + { + "cell_type": "markdown", + "id": "21a89d9c", + "metadata": {}, + "source": [ + "## [Improvement 3] Speeding up generation with CUDA Graphs" + ] + }, + { + "cell_type": "markdown", + "id": "e2d53e7b", + "metadata": {}, + "source": [ + "TransformerEngine includes a function `transformer_engine.pytorch.make_graphed_callables`, which functions similarly to the corresponding feature in PyTorch. It is capable of recording any modules from the Transformer Engine. Below is a code excerpt from `te_gemma.py` from class `TEGemmaForCausalLMCudaGraphs`:\n", + "```\n", + " def __init__(self, config : GemmaConfig):\n", + " (...)\n", + " \n", + " # Here \"the trick\" happens. We override methods from TEGemmaForCausalLM\n", + " # with their recorded version. After invocation of each of them,\n", + " # captured graph will be replayed with minimal usage of CPU,\n", + " # what will lead to huge speedup.\n", + " (...)\n", + " self._model_context_phase = \n", + " self.record_graph(self._model_context_phase, self.hidden_states_buffer) # CUDA Graphs recording\n", + "\n", + " (...) 
\n", + " self._model_generation_phase = \n", + " self.record_graph(self._model_generation_phase, self.generation_buffer) # CUDA Graphs recording\n", + "\n", + " @torch.no_grad()\n", + " def record_graph(self, function, input_tensor):\n", + " (...)\n", + " # function is invoked on argument (self.hidden_states,) and all kernels are recorded.\n", + " # record_graph() returns captured function, which can be run later with minimal use of th CPU.\n", + " fp8_format = Format.HYBRID\n", + " fp8_recipe = DelayedScaling(fp8_format=fp8_format, amax_history_len=32, amax_compute_algo=\"max\")\n", + " with autocast(dtype=torch.bfloat16, cache_enabled=False):\n", + " graphed_function = te.pytorch.make_graphed_callables(\n", + " function, \n", + " (input_tensor,), \n", + " fp8_enabled=True, \n", + " fp8_recipe=fp8_recipe, \n", + " allow_unused_input=True,\n", + " num_warmup_iters=3\n", + " )\n", + " return graphed_function\n", + "```\n", + "\n", + "It is strongly reccomended to review the entire code of the class `TEGemmaForCausalLMCudaGraphs`. Let's now proceed to evaluate the performance improvement offered by CUDA Graphs." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "31a3a8a3", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "============================== Generation example 1 ==============================\n", + "Two facts about GPUs:\n", + "\n", + "1. They are very good at doing the same thing over and over again.\n", + "2. They are very bad at doing different things at the same time.\n", + "\n", + "This is why they are so good at rendering graphics.\n", + "\n", + "The first fact is the\n", + "============================== Generation example 2 ==============================\n", + "Two facts about NVIDIA:\n", + "\n", + "1. It is the world’s largest manufacturer of graphics processing units (GPUs) for the gaming industry.\n", + "2. It is the world’s largest manufacturer of GPUs for the data center industry.\n", + "\n", + "The company’s stock price has\n", + "============================== Benchmarking ==============================\n", + "Benchmarking for batch_size = 64 and max total tokens = 1024\n", + "Time: 16.75 s.\n" + ] + } + ], + "source": [ + "#Restart the notebook (to flush the GPU memory)\n", + "from utils import restart_jupyter_notebook\n", + "restart_jupyter_notebook()\n", + "\n", + "from utils import *\n", + "\n", + "hyperparams.model_name = \"\" # <== Add model weight location here e.g. 
\"/path/to/downloaded/gemma/weights\"\n", + "hyperparams.qkv_format = \"thd\"\n", + "\n", + "hyperparams.generation_cuda_graphs = True\n", + "\n", + "# It is necessary to preallocate a static buffer.\n", + "# CUDA graphs require static input tensors for every kernel.\n", + "# This approach may result in a slight increase in memory consumption;\n", + "# however, the substantial speedup achieved makes it worthwhile.\n", + "hyperparams.cuda_graphs_static_batch_size = 64\n", + "hyperparams.cuda_graphs_static_max_seq_len = 1024\n", + "hyperparams.cuda_graphs_static_max_context_len = 128\n", + "model = init_te_gemma_model(hyperparams)\n", + "\n", + "print_sample_of_generated_texts(model)\n", + "benchmark_generation(model)" + ] + }, + { + "cell_type": "markdown", + "id": "53bb430f", + "metadata": {}, + "source": [ + "The **5.23x** speedup was obtained.\n", + "\n", + "| Models | Time (s) | Speedup | \n", + "|-------------------------------------------------------------|---------------------------------------|--------------------------------------|\n", + "| HF (baseline) | 87.68 | 1 |\n", + "| TE (subsitution of GemmaDecoderLayer with te.TransformerLayer) | 54.11 | 1.62 | \n", + "| TE + THD attention | 28.22 | 3.11 | \n", + "| TE + THD attention + CUDA Graphs | 16.75 | 5.23 | \n" + ] + }, + { + "cell_type": "markdown", + "id": "0a11b75c", + "metadata": {}, + "source": [ + "Let's look at the screenshots from *NVIDIA Nsight System* profiler to see where this speedup comes from:\n", + "\n", + "
\n", + "\n", + "
\n", + "Figure 5: Without CUDA Graphs. One can see that GPU (blue) is idle for big portion of the time.\n", + "
\n", + "
\n", + "\n", + "
\n", + "\n", + "
\n", + "Figure 6: With CUDA Graphs. One can see that GPU (orange) is fully utilized.\n", + "
\n", + "
" + ] + }, + { + "cell_type": "markdown", + "id": "e6b171a0", + "metadata": {}, + "source": [ + "## [Improvement 4] Running generation in FP8 of the model trained in higher precision " + ] + }, + { + "cell_type": "markdown", + "id": "1a80288b", + "metadata": {}, + "source": [ + "Implementing FP8 generation with the Gemma model is not straightforward, because this model was initially trained using BF16 precision, and the necessary FP8 scaling factors are missing. Running the model at this lower precision without proper scaling could lead to significant errors and incorrect results.\n", + "\n", + "It is highly recommended to familiarize oneself with the [tutorial](../../examples/fp8_primer.ipynb) on FP8 precision to understand the necessity of scaling.\n", + "\n", + "\n", + "
\n", + "\n", + "
\n", + " Figure 8: The FP8 scaling factors are incorrect and that leads to numerical errors. The weight calibration allows us to compute FP8 metadata during the forwards in higher precision.\n", + "
\n", + "
\n", + "\n", + "### Weight Calibration\n", + "\n", + "To address the issue outlined above, weight calibration will be used. This involves running several forward iterations at BF16 precision within the context `te.fp8_autocast(enabled=False, calibration=True)`. This setup allows the forward pass to operate at higher precision, while simultaneously collecting `amax_history` and other parameters related to the FP8 precision, which are essential for calculating the FP8 scaling well.\n", + "\n", + "The code below outlines the steps to initialize the BF16 model and conduct several forward iterations within the specified context. After these iterations, the model is saved, and these weights will be utilized in subsequent chapters." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "aecee0e1", + "metadata": {}, + "outputs": [], + "source": [ + "#Restart the notebook (to flush the GPU memory)\n", + "from utils import restart_jupyter_notebook\n", + "restart_jupyter_notebook()\n", + "\n", + "from utils import *\n", + "import transformer_engine.pytorch as te\n", + "\n", + "hyperparams.model_name = \"\" # <== Add model weight location here e.g. \"/path/to/downloaded/gemma/weights\"\n", + "hyperparams.fuse_qkv_params = True # This is needed by the last improvement.\n", + "\n", + "model = init_te_gemma_model(hyperparams)\n", + "\n", + "# Calibration\n", + "with te.fp8_autocast(enabled=False, calibrating=True), \\\n", + " torch.autocast(device_type='cuda', dtype=torch.bfloat16):\n", + " model.train()\n", + " run_forward_pass(model, hyperparams, num_iters=512)\n", + "\n", + "# Compute scale_fwd with enabled fp8 autocast\n", + "with te.fp8_autocast(enabled=True), \\\n", + " torch.autocast(device_type='cuda', dtype=torch.bfloat16):\n", + " run_forward_pass(model, hyperparams, 1)\n", + "\n", + "# Some parameters are in pointing to the same tensors, double save is avoided here.\n", + "dict_to_save = {k: v for k, v in model.state_dict().items() \\\n", + " if (\"_context_phase\" not in k and \"_generation_phase\" not in k)}\n", + "torch.save(dict_to_save, '') # <== Add path to save calibrated weights." + ] + }, + { + "cell_type": "markdown", + "id": "b6dcd135", + "metadata": {}, + "source": [ + "|\n", + "### Generation in FP8\n", + "\n", + "
\n", + "\n", + "
\n", + " Figure 8: After the weight calibration FP8 scaling factors are correct and prevent numerical errors.\n", + "
\n", + "
\n", + "\n", + "Now FP8 inference is ready to be run." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "a913f54d", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "============================== Generation example 1 ==============================\n", + "Two facts about GPUs:\n", + "\n", + "1. They are exorbitantly expensive.\n", + "2. They are exorbitantly powerful.\n", + "\n", + "The first fact is a bummer, but the second fact is a boon. GPUs are exorbitantly powerful \n", + "because they are exorbitantly expensive. GPUs are exorbitantly expensive\n", + "============================== Generation example 2 ==============================\n", + "Two facts about NVIDIA:\n", + "\n", + "1. NVIDIA is a company that makes graphics cards for computers.\n", + "2. NVIDIA is a company that makes graphics cards for computers.\n", + "\n", + "The first fact is true. The second fact is true.\n", + "\n", + "

NVIDIA is a company that makes graphics cards\n", + "============================== Benchmarking ==============================\n", + "Benchmarking for batch_size = 64 and max total tokens = 1024\n", + "Time: 19.31 s.\n" + ] + } + ], + "source": [ + "#Restart the notebook (to flush the GPU memory)\n", + "from utils import restart_jupyter_notebook\n", + "restart_jupyter_notebook()\n", + "\n", + "from utils import *\n", + "\n", + "hyperparams.model_name = \"\" # <== Add model weight location here e.g. \"/path/to/downloaded/gemma/weights\"\n", + "hyperparams.qkv_format = \"thd\"\n", + "hyperparams.fuse_qkv_params = True # This is needed by the last improvement.\n", + "\n", + "hyperparams.fp8 = True\n", + "# Calibrated fp8 weights are loaded directly from the file.\n", + "\n", + "hyperparams.fp8_model_weights_filename = \"\" # <== Add calibrated weights location here.\n", + "\n", + "hyperparams.generation_cuda_graphs = True\n", + "hyperparams.cuda_graphs_static_batch_size = 64\n", + "hyperparams.cuda_graphs_static_max_seq_len = 1024\n", + "hyperparams.cuda_graphs_static_max_context_len = 128\n", + "model = init_te_gemma_model(hyperparams)\n", + "\n", + "print_sample_of_generated_texts(model)\n", + "benchmark_generation(model)" + ] + }, + { + "cell_type": "markdown", + "id": "8cdbb56c", + "metadata": {}, + "source": [ + "One can observe that the outputs are coherent; however, the generation time has increased. Why is this the case?\n", + "\n", + "\n", + "
\n", + "\n", + "
\n", + " Figure 9: Running the model at higher precision involves only one GEMM operation. However, when the model operates in FP8, it requires not just the low-precision GEMM but also weight casting.\n", + "
\n", + "
\n", + "\n", + "Running the model in FP8 does not imply that all weights are stored in FP8. By default, they are stored in higher precision and are cast to FP8, using saved scaling factors, before operations such as GEMMs.\n", + "\n", + "This approach is beneficial during training: one can perform one cast for both backward and forward passes, leading to speedups. However, performing a single cast for each forward pass introduces too much overhead to achieve a speedup. This issue will be addressed in the next section of the tutorial." + ] + }, + { + "cell_type": "markdown", + "id": "8d3945e3", + "metadata": {}, + "source": [ + "### Use of only FP8 model weights" + ] + }, + { + "cell_type": "markdown", + "id": "2dd0cba9", + "metadata": {}, + "source": [ + "TransformerEngine stores parameters in higher precision and only casts them to FP8. It may be necessary to maintain accucacy during training. However, high precision is not needed when doing inference. \n", + "\n", + "Transformer Engine supports maintaining only FP8 weights with `fp8_model_init` decorator. Let's see an example\n", + "```\n", + "linear = te.Linear(1024, 1024) # this module is initialized with full precision weights\n", + "with te.fp8_model_init(enabled=True):\n", + " linear_fp8 = te.Linear(1024, 1024) # this module is initialized only with fp8 weights\n", + "\n", + "assert type(linear.weight.data) is torch.Tensor\n", + "assert type(linear_fp8.weight.data) is te.float8_tensor.Float8Tensor\n", + "```\n", + "\n", + "
\n", + "\n", + "
\n", + " Figure 10: Using fp8_model_init stores the weights directly in FP8 format, which reduces both time and memory usage.\n", + "
\n", + "
\n", + "\n", + "Let's run the code with `fp8_model_init`:" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "96264b9c", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "============================== Generation example 1 ==============================\n", + "Prompt:\n", + "Here are the two facts about GPUs:\n", + "\n", + "Generated text:\n", + "\n", + "1. GPUs are exorbitantly expensive.\n", + "2. GPUs are exorbitantly powerful.\n", + "\n", + "The first fact frustrates me. The second excites me.\n", + "\n", + "I’ve been using GPUs for a while now, and I’ve been using them for\n", + "============================== Generation example 2 ==============================\n", + "Prompt:\n", + "Some facts about NVIDIA:\n", + "\n", + "Generated text:\n", + "\n", + "* NVIDIA is a global technology company that designs and manufactures graphics processing units (GPUs)\n", + " for the gaming, professional visualization, and data center markets.\n", + "* NVIDIA is headquartered in Santa Clara, California, and has offices in more than 25\n", + "============================== Benchmarking ==============================\n", + "Benchmarking for batch_size = 64 and max total tokens = 1024\n", + "Time: 12.13 s.\n" + ] + } + ], + "source": [ + "#Restart the notebook (to flush the GPU memory)\n", + "from utils import restart_jupyter_notebook\n", + "restart_jupyter_notebook()\n", + "\n", + "# Import necessary packages and methods\n", + "from utils import *\n", + "\n", + "hyperparams.model_name = \"\" # <== Add model weight location here e.g. \"/path/to/downloaded/gemma/weights\"\n", + "hyperparams.fuse_qkv_params = True # Needed for fp8_model_init().\n", + "hyperparams.qkv_format = \"thd\"\n", + "\n", + "hyperparams.fp8 = True\n", + "hyperparams.fp8_model_init = True # This will result in storing only fp8 weights.\n", + "hyperparams.fp8_model_weights_filename = \"\" # <== Add calibrated weights location here.\n", + "\n", + "hyperparams.generation_cuda_graphs = True\n", + "hyperparams.cuda_graphs_static_batch_size = 64\n", + "hyperparams.cuda_graphs_static_max_seq_len = 1024\n", + "hyperparams.cuda_graphs_static_max_context_len = 128\n", + "model = init_te_gemma_model(hyperparams)\n", + "\n", + "print_sample_of_generated_texts(model)\n", + "benchmark_generation(model)" + ] + }, + { + "cell_type": "markdown", + "id": "3e30ca5a", + "metadata": {}, + "source": [ + "| Models | Time (s) | Speedup | \n", + "|-------------------------------------------------------------|---------------------------------------|--------------------------------------|\n", + "| HF (baseline) | 87.68 | 1 |\n", + "| TE (subsitution of GemmaDecoderLayer with te.TransformerLayer) | 54.11 | 1.62 | \n", + "| TE + THD attention | 28.22 | 3.11 | \n", + "| TE + THD attention + CUDA Graphs | 16.75 | 5.23 | \n", + "| TE + THD attention + FP8 | 12.13 | 7.23 | \n", + "\n", + "The final speedup is **7.23x**." + ] + }, + { + "cell_type": "markdown", + "id": "c6e87275", + "metadata": {}, + "source": [ + "## Conclusions" + ] + }, + { + "cell_type": "markdown", + "id": "7bb2452d", + "metadata": {}, + "source": [ + "\n", + "
\n", + "\n", + "
\n", + " Figure 11: Times obtained with optimizations using TransformerEngine (seconds).\n", + "
\n", + "
\n", + "\n", + "In this tutorial, we've explored three features of the Transformer Engine:\n", + "1. Support for the THD attention layout,\n", + "2. Integration with CUDA Graphs,\n", + "3. FP8 weights calibration,\n", + "4. Models containing only FP8 version of their parameters.\n", + "\n", + "Each of these features can be applied in various contexts, such as fast token generation. It's important to note that the fastest possible inference speeds can be achieved using NVIDIA's inference-optimized [TensorRT](https://developer.nvidia.com/tensorrt) library." + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.12" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/docs/examples/te_gemma/utils.py b/docs/examples/te_gemma/utils.py new file mode 100644 index 0000000000..292a452f42 --- /dev/null +++ b/docs/examples/te_gemma/utils.py @@ -0,0 +1,320 @@ +# Copyright (c) 2022-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# See LICENSE for license information. + +import time +import sys +import IPython +import random +import string + +from te_gemma_loading_weights import load_te_model + +import torch +from torch.optim import AdamW +from torch.utils.data import DataLoader + +from transformers import ( + AutoModelForCausalLM, + AutoTokenizer, + get_linear_schedule_with_warmup, + AutoConfig, +) +from transformers import DataCollatorForLanguageModeling +from datasets import load_dataset +from accelerate import Accelerator +from accelerate.utils.dataclasses import FP8RecipeKwargs + + +from te_gemma import TEGemmaForCausalLM, TEGemmaForCausalLMCudaGraphs + + +class HyperParameters: + def __init__(self): + self.mixed_precision = "bf16" + self.model_name = None + + self.fp8 = False + + # Weights in fp8 + self.fp8_model_weights_filename = None + self.fp8_model_init = False + + # Cuda graphs + self.generation_cuda_graphs = False + self.cuda_graphs_static_batch_size = 16 + self.cuda_graphs_static_max_seq_len = 256 + self.cuda_graphs_static_max_context_len = 16 + + # Finetuning settings. + self.dataset_name = "timdettmers/openassistant-guanaco" + self.dataset_text_field = "text" + self.learning_rate = 1.41e-5 + self.batch_size = 8 + self.max_seq_length = 256 + self.gradient_accumulation_steps = 1 + self.num_warmup_steps = 5 + self.num_training_steps = 10 + + # QKV format. + self.fuse_qkv_params = False + self.qkv_format = "bshd" + + +hyperparams = HyperParameters() + +assert ( + torch.backends.cudnn.version() >= 9100 +), "cuDNN version >= 9.1.0 is needed to run this tutorial." 
+ + +def get_dataloaders(accelerator: Accelerator, hyperparams): + dataset = load_dataset(hyperparams.dataset_name, split="train") + tokenizer = AutoTokenizer.from_pretrained(hyperparams.model_name) + + def tokenize(element): + outputs = tokenizer( + element["text"], + truncation=True, + padding=False, + max_length=hyperparams.max_seq_length, + return_overflowing_tokens=False, + return_length=False, + ) + return {"input_ids": outputs["input_ids"], "attention_mask": outputs["attention_mask"]} + + with accelerator.main_process_first(): + dataset = dataset.map(tokenize, batched=True, remove_columns=dataset.column_names) + + # Simply pad to the multiple of 16 for both FP8 and BF16 precision + pad_to_multiple_of = 16 + data_collator = DataCollatorForLanguageModeling( + tokenizer=tokenizer, + mlm=False, + pad_to_multiple_of=pad_to_multiple_of, + ) + + dataloader_params = { + "batch_size": hyperparams.batch_size, + "collate_fn": data_collator, + "drop_last": True, + } + train_dataloader = DataLoader(dataset, **dataloader_params) + return train_dataloader + + +def init_baseline_model(hyperparams): + # Init the model + config = AutoConfig.from_pretrained(hyperparams.model_name) + # make sure to use flash_attention to do iso comparison with TEGemmaModel + config._attn_implementation = "flash_attention_2" + model = AutoModelForCausalLM.from_pretrained( + hyperparams.model_name, + config=config, + torch_dtype=torch.bfloat16, + ) + return model.cuda() + + +def init_te_gemma_model(hyperparams): + cls = TEGemmaForCausalLMCudaGraphs if hyperparams.generation_cuda_graphs else TEGemmaForCausalLM + config = AutoConfig.from_pretrained(hyperparams.model_name) + config._attn_implementation = "flash_attention_2" + # Adding all params from the hyperparams to the config to make the code simpler. 
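+    # For example, the TE Gemma layers read fuse_qkv_params and qkv_format from this config.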
+ for key, value in hyperparams.__dict__.items(): + setattr(config, key, value) + model = load_te_model(cls, config) + if hyperparams.generation_cuda_graphs: + model.record() + return model.cuda() + + +def wrap_with_accelerator(model, hyperparams): + # Create FP8 kwarg handler if required + fp8_kwarg_handler = ( + [FP8RecipeKwargs(backend="te")] if hyperparams.mixed_precision == "fp8" else None + ) + + # Init HF accelerator that's used for training + accelerator = Accelerator( + log_with="wandb", + gradient_accumulation_steps=hyperparams.gradient_accumulation_steps, + mixed_precision=hyperparams.mixed_precision, + kwargs_handlers=fp8_kwarg_handler, + ) + # accelerator.print(f'State: {accelerator.state}') + train_dataloader = get_dataloaders(accelerator, hyperparams) + + # Wrap model, optimizer/scheduler, dataloaders in accelerate + optimizer = AdamW(params=model.parameters(), lr=hyperparams.learning_rate, fused=True) + lr_scheduler = get_linear_schedule_with_warmup( + optimizer=optimizer, + num_warmup_steps=100, + num_training_steps=hyperparams.num_training_steps, + ) + model, optimizer, train_dataloader, lr_scheduler = accelerator.prepare( + model, optimizer, train_dataloader, lr_scheduler + ) + + return accelerator, model, optimizer, train_dataloader, lr_scheduler + + +def finetune_model(model, hyperparams, accelerator, train_dataloader, optimizer, lr_scheduler): + model.train() + optimizer.zero_grad() + train_dataloader = enumerate(train_dataloader) + + def run_iters(num_iters): + for _ in range(num_iters): + _, batch = next(train_dataloader) + with accelerator.accumulate(model): + outputs = model(**batch) + loss = outputs.loss + accelerator.backward(loss) + optimizer.step() + lr_scheduler.step() + optimizer.zero_grad() + + run_iters(hyperparams.num_warmup_steps) # Warmup iters + + # Get the timers ready + start = torch.cuda.Event(enable_timing=True) + end = torch.cuda.Event(enable_timing=True) + torch.cuda.synchronize() + + start.record() + run_iters(hyperparams.num_training_steps) # Training iters + torch.cuda.synchronize() + end.record() + accelerator.end_training() + + print( + f"""{hyperparams.num_training_steps} finetuning steps complete!\n + Average time taken per step: + {(start.elapsed_time(end)/hyperparams.num_training_steps):.0f} + milliseconds""" + ) + + +def restart_jupyter_notebook(): + # Try restarting the Jupyter kernel + IPython.Application.instance().kernel.do_shutdown(True) + + # Check whether the device memory has been flushed + if torch.cuda.memory_allocated() != 0: + import warnings + + warnings.warn("The device memory hasn't been flushed, trying with a second method!") + + # Try restarting the Jupyter kernel another way + # Restart the kernel + from IPython.core.display import HTML + + HTML("") + + if torch.cuda.memory_allocated() != 0: + print( + "The device memory hasn't been flushed, try manually restarting the Jupyter kernel!" + ) + + # Suppress the warnings + if not sys.warnoptions: + import warnings + + warnings.simplefilter("ignore") + torch.set_warn_always(False) + + +@torch.no_grad() +def run_forward_pass(model, hyperparams, num_iters): + """ + It runs num_iters forward passes with sample data. 
+ """ + accelerator = Accelerator( + log_with="wandb", + gradient_accumulation_steps=hyperparams.gradient_accumulation_steps, + mixed_precision="no", + ) + train_dataloader = get_dataloaders(accelerator, hyperparams) + + model.train() + train_dataloader = enumerate(train_dataloader) + + for _ in range(num_iters): + _, batch = next(train_dataloader) + batch["input_ids"] = batch["input_ids"].cuda() + model(batch["input_ids"]) + + +""" + Benchmarking and example generation functions. +""" + + +def print_sample_of_generated_texts(model): + tokenizer = AutoTokenizer.from_pretrained(hyperparams.model_name) + prompts = ["Here are the two facts about GPUs:", "Some facts about NVIDIA:"] + inputs = tokenizer(prompts * 32, return_tensors="pt", padding=True) + + max_length = inputs["input_ids"].size(1) + new_length = ((max_length + 63) // 64) * 128 + inputs["input_ids"] = torch.nn.functional.pad( + inputs["input_ids"], (new_length - max_length, 0), value=tokenizer.pad_token_id + ) + inputs["attention_mask"] = torch.nn.functional.pad( + inputs["attention_mask"], (new_length - max_length, 0), value=0 + ) + + inputs["input_ids"] = inputs["input_ids"].cuda() + inputs["attention_mask"] = inputs["attention_mask"].cuda() + + outputs = model.generate(**inputs, max_new_tokens=50) + generated_texts = tokenizer.batch_decode(outputs, skip_special_tokens=True) + + print("=" * 30 + " Generation example 1 " + "=" * 30) + print("Prompt:") + print(generated_texts[0][: len(prompts[0])]) + print("Generated text:") + print(generated_texts[0][len(prompts[0]) :]) + print("=" * 30 + " Generation example 2 " + "=" * 30) + print("Prompt:") + print(generated_texts[1][: len(prompts[1])]) + print("") + print("Generated text:") + print(generated_texts[1][len(prompts[1]) :]) + + +def _generate_random_words(num_words, max_word_length): + words = [] + for _ in range(num_words): + word_length = random.randint(1, max_word_length) + word = "".join(random.choices(string.ascii_lowercase, k=word_length)) + words.append(word) + return words + + +def benchmark_generation(model): + batch_size = 64 + context_length = 128 + max_new_tokens = 1024 - 128 + print("=" * 30 + " Benchmarking " + "=" * 30) + print( + f"Benchmarking for batch_size = {batch_size} and max total tokens =" + f" {context_length + max_new_tokens}" + ) + + input_str = _generate_random_words(batch_size, context_length) + + tokenizer = AutoTokenizer.from_pretrained(hyperparams.model_name) + inputs = tokenizer(input_str, return_tensors="pt", padding=True) + + start = torch.cuda.Event(enable_timing=True) + end = torch.cuda.Event(enable_timing=True) + torch.cuda.synchronize() + start.record() + + model.generate(inputs["input_ids"].cuda(), max_new_tokens=max_new_tokens) + torch.cuda.synchronize() + end.record() + + print(f"Time: {start.elapsed_time(end)/1000:.2f} s.") diff --git a/docs/examples/te_llama/tutorial_accelerate_hf_llama_with_te.ipynb b/docs/examples/te_llama/tutorial_accelerate_hf_llama_finetuning_with_te.ipynb similarity index 99% rename from docs/examples/te_llama/tutorial_accelerate_hf_llama_with_te.ipynb rename to docs/examples/te_llama/tutorial_accelerate_hf_llama_finetuning_with_te.ipynb index 57c1bf6601..0d3ada8a12 100644 --- a/docs/examples/te_llama/tutorial_accelerate_hf_llama_with_te.ipynb +++ b/docs/examples/te_llama/tutorial_accelerate_hf_llama_finetuning_with_te.ipynb @@ -5,7 +5,7 @@ "id": "6a5b2993", "metadata": {}, "source": [ - "# Accelerating a Hugging Face Llama 2 and Llama 3 models with Transformer Engine\n", + "# Accelerating a Hugging Face Llama 2 
and Llama 3 models finetuning with Transformer Engine\n", "\n", "
\n", "\n", diff --git a/docs/index.rst b/docs/index.rst index d64cebbfa2..316c2ded59 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -44,7 +44,9 @@ Transformer Engine documentation examples/fp8_primer.ipynb examples/advanced_optimizations.ipynb - examples/te_llama/tutorial_accelerate_hf_llama_with_te.ipynb + examples/te_llama/tutorial_accelerate_hf_llama_finetuning_with_te.ipynb + examples/te_gemma/tutorial_accelerate_hf_gemma_finetuning_with_te.ipynb + examples/te_gemma/tutorial_generation_gemma_with_te.ipynb .. toctree:: :hidden: diff --git a/qa/L0_pytorch_unittest/test.sh b/qa/L0_pytorch_unittest/test.sh index 90c5e499f3..985f92cedd 100644 --- a/qa/L0_pytorch_unittest/test.sh +++ b/qa/L0_pytorch_unittest/test.sh @@ -22,5 +22,6 @@ pytest -v -s $TE_PATH/tests/pytorch/test_gqa.py pytest -v -s $TE_PATH/tests/pytorch/test_recipe.py pytest -v -s $TE_PATH/tests/pytorch/test_fused_optimizer.py pytest -v -s $TE_PATH/tests/pytorch/test_multi_tensor.py +pytest -v -s $TE_PATH/tests/pytorch/test_generation.py pytest -v -s $TE_PATH/tests/pytorch/test_fusible_ops.py pytest -v -s $TE_PATH/tests/pytorch/test_fusible_ops_distributed.py diff --git a/tests/pytorch/test_fused_rope.py b/tests/pytorch/test_fused_rope.py index d6ba66cbbc..a2ce84293c 100644 --- a/tests/pytorch/test_fused_rope.py +++ b/tests/pytorch/test_fused_rope.py @@ -11,7 +11,7 @@ def apply_rotary_pos_emb_thd( - t: torch.Tensor, cu_seqlens: torch.Tensor, freqs: torch.Tensor + t: torch.Tensor, cu_seqlens: torch.Tensor, freqs: torch.Tensor, start_positions: torch.Tensor ) -> torch.Tensor: """A baseline implementation of applying RoPE for `thd` format. @@ -20,14 +20,106 @@ def apply_rotary_pos_emb_thd( cu_seqlens(Tensor): Cumulative sum of sequence lengths in a batch for `t`, with shape [b + 1] and dtype torch.int32. freqs (Tensor): Rotary Positional embedding tensor freq is of shape [max_s, 1, 1, d] + start_positions (Tensor): Tensor of shape [b] determining the beginning offsets + of frequeuncies applied to sequences. Returns: Tensor: Shape [t, h, d]. The input tensor after applying RoPE. """ seqlens = (cu_seqlens[1:] - cu_seqlens[:-1]).tolist() - return torch.cat( - [apply_rotary_pos_emb(x.unsqueeze(1), freqs[: x.size(0)]) for x in torch.split(t, seqlens)] - ).squeeze(1) + if start_positions is None: + return torch.cat( + [ + apply_rotary_pos_emb(x.unsqueeze(1), freqs[: x.size(0)]) + for x in torch.split(t, seqlens) + ] + ).squeeze(1) + else: + return torch.cat( + [ + apply_rotary_pos_emb( + x.unsqueeze(1), freqs[start_positions[i] : (x.size(0) + start_positions[i])] + ) + for i, x in enumerate(torch.split(t, seqlens)) + ] + ).squeeze(1) + + +def apply_rotary_pos_emb_with_start_positions( + t: torch.Tensor, + freqs: torch.Tensor, + tensor_format: str = "sbhd", + start_positions: Union[torch.Tensor, None] = None, +) -> torch.Tensor: + """ + Apply rotary positional embedding tensor to the input tensor. + This is non-fused version which supports start_positions parameters. + Non-fused implementation with start_positions is slow, thus it is not included in the + Transformer Engine directly. + + Parameters + ---------- + t: torch.Tensor + Input tensor of shape `[s, b, h, d]`, `[b, s, h, d]` or `[t, h, d]`, on which + rotary positional embedding will be applied. + freqs: torch.Tensor + Rotary positional embedding tensor of shape `[s2, 1, 1, d2]` and dtype 'float', + with `s2 >= s` and `d2 <= d`. + tensor_format: {'sbhd', 'bshd'}, default = 'sbhd' + start_positions: torch.Tensor, default = None. 
+ We may not want begin all the sequences from the 0 embedding. + This tensor argument allows that. + """ + + def _rotate_half(x: torch.Tensor) -> torch.Tensor: + """ + change sign so the last dimension becomes [-odd, +even] + """ + x = x.view(x.shape[:-1] + torch.Size((2, x.shape[-1] // 2))) + x1, x2 = x.unbind(dim=-2) + return torch.cat((-x2, x1), dim=-1) + + if start_positions is None: + return apply_rotary_pos_emb(t, freqs, tensor_format=tensor_format) + + max_seq_len = freqs.shape[0] + cur_seq_len = t.shape[1] if tensor_format == "bshd" else t.shape[0] + + # Only apply the rotary embeddings up to the sequence length of the running + # input. + assert ( + cur_seq_len <= max_seq_len + ), f"Rotary Embeddings only supported up to {max_seq_len} sequence length!" + + if tensor_format == "bshd": + t = t.transpose(0, 1) + # cos/sin first then dtype conversion for better precision + cos_ = torch.cos(freqs).to(t.dtype) + sin_ = torch.sin(freqs).to(t.dtype) + + rot_dim = freqs.shape[-1] + # ideally t_pass is empty so rotary pos embedding is applied to all tensor t + t, t_pass = t[..., :rot_dim], t[..., rot_dim:] + + # shifted_sin, shifted_cos will have the same shape as t. They will contain + # scaling factors shifted for each sequence by the corresponding start_positions offset. + + shifted_sin = sin_[:cur_seq_len].expand(t.shape).clone() + shifted_cos = cos_[:cur_seq_len].expand(t.shape).clone() + + for b in range(start_positions.shape[0]): + assert max_seq_len >= start_positions[b] + shifted_freq = slice(start_positions[b], (start_positions[b] + cur_seq_len)) + shifted_sin[:, b, :] = sin_[shifted_freq, 0, ...] + shifted_cos[:, b, :] = cos_[shifted_freq, 0, ...] + + t = (t * shifted_cos) + (_rotate_half(t) * shifted_sin) + out = torch.cat((t, t_pass), dim=-1) + + if tensor_format == "bshd": + out = out.transpose(0, 1).contiguous() + + return out def get_tol(dtype: torch.dtype) -> Dict: @@ -54,8 +146,9 @@ def _non_overlapping_grad(output: torch.Tensor) -> torch.Tensor: @pytest.mark.parametrize("hidden_size", [128, 256]) @pytest.mark.parametrize("rotary_percent", [0.5, 1.0]) @pytest.mark.parametrize("margin", [0, 10]) +@pytest.mark.parametrize("start_positions", [True, False]) @pytest.mark.parametrize("transpose", [None, (0, 1), (2, 3)]) -@pytest.mark.parametrize("tensor_format", ["sbhd", "bshd"]) +@pytest.mark.parametrize("tensor_format", ["bshd", "sbhd"]) @pytest.mark.parametrize("loss_func", [_overlapping_grad, _non_overlapping_grad]) def test_fused_rope( dtype: torch.dtype, @@ -63,6 +156,7 @@ def test_fused_rope( hidden_size: int, rotary_percent: float, margin: int, + start_positions: bool, transpose: Union[Tuple, None], tensor_format: str, loss_func: Callable, @@ -80,11 +174,24 @@ def test_fused_rope( t = t.transpose(*transpose).contiguous().transpose(*transpose) t.requires_grad = True + if margin == 0 and start_positions == True: + # If sequence to encode has the same length as length of encoding + # there is no space left for starting with positions >0. 
+ pytest.skip("Skipping test with margin=0 and start_positions=True") + + start_positions = ( + torch.randint(0, margin, (batch_size,), dtype=torch.int32, device=device) + if start_positions + else None + ) + rotary_pos_emb = RotaryPositionEmbedding(hidden_size, rotary_percent) emb = rotary_pos_emb(seq_length) # unfused - output_unfused = apply_rotary_pos_emb(t, emb, tensor_format=tensor_format, fused=False) + output_unfused = apply_rotary_pos_emb_with_start_positions( + t, emb, tensor_format=tensor_format, start_positions=start_positions + ) loss_unfused = loss_func(output_unfused) loss_unfused.backward() grad_unfused = t.grad.detach().clone() @@ -92,10 +199,7 @@ def test_fused_rope( # fused output_fused = apply_rotary_pos_emb( - t, - emb, - tensor_format=tensor_format, - fused=True, + t, emb, tensor_format=tensor_format, fused=True, start_positions=start_positions ) loss_fused = loss_func(output_fused) loss_fused.backward() @@ -112,12 +216,14 @@ def test_fused_rope( @pytest.mark.parametrize("rotary_percent", [0.5, 1.0]) @pytest.mark.parametrize("transpose", [None, (1, 2)]) @pytest.mark.parametrize("loss_func", [_overlapping_grad, _non_overlapping_grad]) +@pytest.mark.parametrize("start_positions", [True, False]) def test_fused_rope_thd( dtype: torch.dtype, hidden_size: int, rotary_percent: float, transpose: Union[Tuple, None], loss_func: Callable, + start_positions: bool, ) -> None: device = torch.device("cuda:0") batch_size, head_num = 2, 64 @@ -135,11 +241,17 @@ def test_fused_rope_thd( t = t.transpose(*transpose).contiguous().transpose(*transpose) t.requires_grad = True + start_positions = ( + torch.randint(0, 20, (cu_seqlens.shape[-1],), dtype=torch.int32, device=device) + if start_positions + else None + ) + rotary_pos_emb = RotaryPositionEmbedding(hidden_size, rotary_percent) emb = rotary_pos_emb(cu_seqlens[-1]) # unfused - output_unfused = apply_rotary_pos_emb_thd(t, cu_seqlens, emb) + output_unfused = apply_rotary_pos_emb_thd(t, cu_seqlens, emb, start_positions=start_positions) loss_unfused = loss_func(output_unfused) loss_unfused.backward() grad_unfused = t.grad.detach().clone() @@ -147,7 +259,12 @@ def test_fused_rope_thd( # fused output_fused = apply_rotary_pos_emb( - t, emb, fused=True, tensor_format="thd", cu_seqlens=cu_seqlens + t, + emb, + fused=True, + tensor_format="thd", + cu_seqlens=cu_seqlens, + start_positions=start_positions, ) loss_fused = loss_func(output_fused) loss_fused.backward() diff --git a/tests/pytorch/test_generation.py b/tests/pytorch/test_generation.py new file mode 100644 index 0000000000..343dd4db1d --- /dev/null +++ b/tests/pytorch/test_generation.py @@ -0,0 +1,210 @@ +# Copyright (c) 2022-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# See LICENSE for license information. + +import pytest +import torch + +import transformer_engine.pytorch as te + + +class TestInferenceParams: + def test_setup_before_new_input_bshd(self): + inference_params = te.attention.InferenceParams(64, 128, qkv_format="bshd") + + inference_params.setup_before_new_input(length=16) + # Offset before first sequence is equal to 0. + assert inference_params.sequence_len_offset == 0 + + # Offset before second sequence is equal to 16. 
+ inference_params.setup_before_new_input(length=4) + assert inference_params.sequence_len_offset == 16 + + def test_setup_before_new_input_thd(self): + inference_params = te.attention.InferenceParams(4, 128, qkv_format="thd") + + inference_params.setup_before_new_input( + lengths_tensor=torch.Tensor([1, 0, 2, 4]).cuda(), max_input_length=20 + ) + + assert torch.equal( + inference_params.cached_sequence_lengths, torch.Tensor([0, 0, 0, 0]).cuda() + ) + assert torch.equal( + inference_params.input_sequence_lengths, torch.Tensor([1, 0, 2, 4]).cuda() + ) + assert inference_params.max_incoming_seq_len == 20 + + inference_params.setup_before_new_input( + lengths_tensor=torch.Tensor([2, 3, 5, 1]).cuda(), max_input_length=10 + ) + assert torch.equal( + inference_params.cached_sequence_lengths, torch.Tensor([1, 0, 2, 4]).cuda() + ) + assert torch.equal( + inference_params.input_sequence_lengths, torch.Tensor([2, 3, 5, 1]).cuda() + ) + assert inference_params.max_incoming_seq_len == 10 + + @pytest.mark.parametrize("dtype", [torch.float32, torch.bfloat16, torch.float16]) + @pytest.mark.parametrize("batch_size", [64, 128, 256]) + @pytest.mark.parametrize("max_seq_len", [128, 256, 512]) + @pytest.mark.parametrize("max_input_len", [32, 128]) + def test_save_to_kv_cache_thd(self, batch_size, max_seq_len, max_input_len, dtype): + h, d = 16, 256 + + inference_params = te.attention.InferenceParams(batch_size, max_seq_len, qkv_format="thd") + inference_params.allocate_memory_for_kv_cache_if_empty(1, h, d, dtype) + + t = batch_size * max_input_len + key_layer = torch.randn((t, h, d)).cuda().to(dtype) + value_layer = torch.randn((t, h, d)).cuda().to(dtype) + + sequence_lengths = [1, 2] * (batch_size // 2) + + # We save the same sequences two time, which should result in sequences of lentgh 2 and 4 + # in the cache + inference_params.reset() + inference_params.setup_before_new_input( + lengths_tensor=torch.tensor(sequence_lengths).cuda(), max_input_length=max_input_len + ) + inference_params.save_to_kv_cache(1, key_layer, value_layer) + + inference_params.setup_before_new_input( + lengths_tensor=torch.tensor(sequence_lengths).cuda(), max_input_length=max_input_len + ) + inference_params.save_to_kv_cache(1, key_layer, value_layer) + + key_memory, value_memory = inference_params.key_value_memory_dict[1] + + # Chcek whether the sequences were copied properly. + + def check(memory, layer, b, idx1, idx2): + # Check if sequence idx in batch b in memory corresponds + # to the sequence idx2 in batch b in layer. 
+ assert torch.equal(memory[b * max_seq_len + idx1], layer[b * max_input_len + idx2, :]) + + # even indices + for b in range(0, batch_size, 2): + check(key_memory, key_layer, b, 0, 0) + check(key_memory, key_layer, b, 1, 0) + assert (key_memory[b * max_seq_len + 2 : ((b + 1) * max_seq_len)] == 0).all() + + check(value_memory, value_layer, b, 0, 0) + check(value_memory, value_layer, b, 1, 0) + assert (value_memory[b * max_seq_len + 2 : ((b + 1) * max_seq_len)] == 0).all() + + # odd indices + for b in range(1, batch_size, 2): + check(key_memory, key_layer, b, 0, 0) + check(key_memory, key_layer, b, 1, 1) + check(key_memory, key_layer, b, 2, 0) + check(key_memory, key_layer, b, 3, 1) + assert (key_memory[b * max_seq_len + 4 : ((b + 1) * max_seq_len)] == 0).all() + + check(value_memory, value_layer, b, 0, 0) + check(value_memory, value_layer, b, 1, 1) + check(value_memory, value_layer, b, 2, 0) + check(value_memory, value_layer, b, 3, 1) + assert (value_memory[b * max_seq_len + 4 : ((b + 1) * max_seq_len)] == 0).all() + + @pytest.mark.parametrize("dtype", [torch.float32, torch.bfloat16, torch.float16]) + @pytest.mark.parametrize("batch_size", [64, 128, 256]) + @pytest.mark.parametrize("max_seq_len", [128, 256, 512]) + def test_save_to_kv_cache_bshd(self, batch_size, max_seq_len, dtype): + # This test checks if key_layer and value_layer are copied to cache. + # Cache size is equal to the size of one key/value layer. + h, d = 16, 256 + + inference_params = te.attention.InferenceParams(batch_size, max_seq_len, qkv_format="bshd") + + inference_params.allocate_memory_for_kv_cache_if_empty(1, h, d, dtype) + key_layer = torch.randn((max_seq_len, batch_size, h, d)).cuda().to(dtype) + value_layer = torch.randn((max_seq_len, batch_size, h, d)).cuda().to(dtype) + + inference_params.setup_before_new_input(length=0) + inference_params.save_to_kv_cache(1, key_layer, value_layer) + + key_memory, value_memory = inference_params.key_value_memory_dict[1] + + assert torch.equal(key_memory, key_layer) + assert torch.equal(value_memory, value_layer) + + @pytest.mark.parametrize("layer_number", [1, 100]) + @pytest.mark.parametrize("batch_size", [1, 128]) + @pytest.mark.parametrize("dtype", [torch.float32, torch.bfloat16, torch.float16]) + def test_allocate_memory_for_kv_cache_if_empty(self, layer_number, batch_size, dtype): + nr_heads = 16 + head_dim = 256 + max_sequence_len = 128 + inference_params = te.attention.InferenceParams( + batch_size, max_sequence_len, qkv_format="bshd" + ) + + assert layer_number not in inference_params.key_value_memory_dict + + inference_params.allocate_memory_for_kv_cache_if_empty( + layer_number, nr_heads, head_dim, dtype + ) + + key_memory, value_memory = inference_params.key_value_memory_dict[layer_number] + + assert key_memory.shape == (max_sequence_len, batch_size, nr_heads, head_dim) + assert value_memory.shape == (max_sequence_len, batch_size, nr_heads, head_dim) + + # Should not allocate new buffers. + inference_params.allocate_memory_for_kv_cache_if_empty(layer_number, 100, 100, dtype) + + assert key_memory.shape == (max_sequence_len, batch_size, nr_heads, head_dim) + assert value_memory.shape == (max_sequence_len, batch_size, nr_heads, head_dim) + + def test_set_params_to_thd_attention(self): + # This test check whether parameteres needed to run thd attention + # are computed correcly. This parameters are passed to the fused_attn_fwd(..) + # to indicate which parts of the key/query/value layers are sequences and + # which of them are offsets. 
+ batch_size = 4 + channels = 1024 + max_sequence_len = 128 + max_input_len = 20 + inference_params = te.attention.InferenceParams( + batch_size, max_sequence_len, qkv_format="thd" + ) + + inference_params.setup_before_new_input( + lengths_tensor=torch.Tensor([1, 1, 1, 1]).cuda(), max_input_length=max_input_len + ) + inference_params.setup_before_new_input( + lengths_tensor=torch.Tensor([1, 0, 2, 4]).cuda(), max_input_length=max_input_len + ) + + buffers = [torch.zeros(batch_size + 1, dtype=torch.int32, device="cuda") for _ in range(6)] + max_q_len, max_kv_len, buffers = inference_params.set_params_to_thd_attention( + buffers, channels + ) + + cu_seqlens_q, cu_seqlens_kv, seq_offsets_q, seq_offsets_k, seq_offsets_v, seq_offsets_o = ( + buffers + ) + + assert max_q_len == max_input_len + assert max_kv_len == max_sequence_len + assert torch.equal(cu_seqlens_q, torch.tensor([0, 1, 1, 3, 7]).cuda()) + assert torch.equal(cu_seqlens_kv, torch.tensor([0, 2, 3, 6, 11]).cuda()) + + assert torch.equal( + seq_offsets_q, + torch.tensor([k * max_input_len * channels for k in range(batch_size + 1)]).cuda(), + ) + assert torch.equal( + seq_offsets_k, + torch.tensor([k * max_sequence_len * channels for k in range(batch_size + 1)]).cuda(), + ) + assert torch.equal( + seq_offsets_v, + torch.tensor([k * max_sequence_len * channels for k in range(batch_size + 1)]).cuda(), + ) + assert torch.equal( + seq_offsets_o, + torch.tensor([k * max_input_len * channels for k in range(batch_size + 1)]).cuda(), + ) diff --git a/tests/pytorch/test_numerics.py b/tests/pytorch/test_numerics.py index 7eed97a0ca..8e20957384 100644 --- a/tests/pytorch/test_numerics.py +++ b/tests/pytorch/test_numerics.py @@ -3,8 +3,9 @@ # See LICENSE for license information. import math +import functools import os -from typing import Dict, List, Optional +from typing import Dict, List, Tuple, Optional import pytest import copy @@ -12,6 +13,8 @@ import torch.nn as nn from torch.nn import Parameter +import transformer_engine.pytorch.cpp_extensions as ext + from transformer_engine.pytorch.fp8 import fp8_autocast, FP8GlobalStateManager, fp8_model_init from transformer_engine.pytorch.utils import ( init_method_normal, @@ -40,6 +43,22 @@ fp8_available, reason_for_no_fp8 = FP8GlobalStateManager.is_fp8_available() +@functools.cache +def _cudnn_version() -> Tuple[int, int, int]: + """Runtime cuDNN version (major, minor, patch)""" + encoded_version = ext.get_cudnn_version() + major_version_magnitude = 1000 if encoded_version < 90000 else 10000 + major, encoded_version = divmod(encoded_version, major_version_magnitude) + minor, patch = divmod(encoded_version, 100) + return (major, minor, patch) + + +def get_device_compute_capability() -> Tuple[int, int]: + """CUDA compute capability of current GPU""" + props = torch.cuda.get_device_properties(torch.cuda.current_device()) + return (props.major, props.minor) + + seed = 1234 torch.manual_seed(seed) torch.cuda.manual_seed(seed) @@ -1682,6 +1701,139 @@ def test_kv_cache_accuracy(dtype, bs, model_key, use_RoPE, input_format, module, assert_allclose(full_output, incremental_output, atol[dtype]) +@pytest.mark.parametrize("dtype", param_types) +@pytest.mark.parametrize("bs", batch_sizes) +@pytest.mark.parametrize("model_key", model_configs_inference.keys()) +@pytest.mark.parametrize("use_RoPE", all_boolean) +@pytest.mark.parametrize("module", module_inference) +@pytest.mark.skipif( + get_device_compute_capability() < (9, 0), reason="THD is only supported on Hopper+." 
+) +@pytest.mark.skipif(_cudnn_version() < (9, 0, 0), reason="cuDNN 9.0.0+ is required.") +def test_kv_cache_accuracy_thd(dtype, bs, model_key, use_RoPE, module): + """ + In thd attention sequences can have various lengths, + different that 's' dimension of input to the Transformer Layer. + + The test contains of: + - one context phase when sequences with various lengths(!) are passed through the model, + - 2 phases when sequences with length 1 are passed through the model. + + The output is compared with the case when all this sequences are passed at one. + """ + if dtype == torch.float32: + pytest.skip("torch.float32 does not support thd") + + fused_attn_env = os.environ["NVTE_FUSED_ATTN"] + os.environ["NVTE_FUSED_ATTN"] = "1" # Only fused attention supports thd. + + if not fp8_available: + pytest.skip(reason_for_no_fp8) + + config = model_configs_inference[model_key] + + S = config.seq_len + B = bs + H = config.num_attention_heads + D = config.hidden_size + G = 2 # generation phase length + S_max = S + G + head_size = config.embed + + layer_number = 1 + rotary_freqs = torch.randn((S_max, 1, 1, head_size), dtype=torch.float, device="cuda") + + # Tensors have shapes [b, s, h, d] and the seqlens are the tensor of shapes [b] + # which indicate the length of sequences - sequences starts from the begining. + # This function copies sequences from tensor into dst_tensor. + # dst_tensor should be big enough to fit this sequences. + def _concat_thd(dst_tensor, dst_seqlens, tensor, seqlens): + for b in range(B): + dst_tensor[b, dst_seqlens[b] : (dst_seqlens[b] + seqlens[b]), :] = tensor[ + b, : seqlens[b], : + ] + dst_seqlens.copy_(dst_seqlens + seqlens) + + if module == "TransformerLayer": + model = TransformerLayer( + hidden_size=D, + ffn_hidden_size=4 * D, + num_attention_heads=H, + attn_input_format="thd", + self_attn_mask_type="padding_causal", + layer_number=layer_number, + params_dtype=dtype, + device="cuda", + ).eval() + attn_name = "self_attn_mask_type" + else: + model = ( + MultiheadAttention( + hidden_size=D, + num_attention_heads=H, + qkv_format="thd", + layer_number=layer_number, + params_dtype=dtype, + attn_mask_type="padding_causal", + ) + .cuda() + .eval() + ) + attn_name = "attn_mask_type" + + inference_params = InferenceParams(B, S_max, qkv_format="thd") + + kwargs = { + "inference_params": inference_params, + "rotary_pos_emb": rotary_freqs if use_RoPE else None, + } + + total_sequence_lengths = torch.zeros((B,)).cuda().to(torch.int32) + total_tensor = torch.zeros((B, S_max, D)).cuda().to(dtype) + + # Sequences split into chunks. + + # context phase + sequence_lengths = torch.randint(1, S, (B,)).cuda().to(torch.int32) + chunk = torch.randn((B, S, D)).cuda().to(dtype) + inference_params.setup_before_new_input(max_input_length=S, lengths_tensor=sequence_lengths) + model( + chunk, inference_params=inference_params, rotary_pos_emb=rotary_freqs if use_RoPE else None + ) + _concat_thd(total_tensor, total_sequence_lengths, chunk, sequence_lengths) + + # generation phase + for _ in range(G): + sequence_lengths = torch.ones((B,)).cuda().to(torch.int32) + chunk = torch.randn((B, 1, D)).cuda().to(dtype) + inference_params.setup_before_new_input(max_input_length=1, lengths_tensor=sequence_lengths) + # we need to remove 'causal' from mask + # otherwise tokens we add will be considered as a first in the sequence, + # but they need to interact with all tokens from key-value cache. 
+ # after removing this line, tests should fail + kwargs[attn_name] = "padding" + output = model(chunk, **kwargs) + _concat_thd(total_tensor, total_sequence_lengths, chunk, sequence_lengths) + incremental_logits = output[:, -1, :] # last element of each seq. + + # Sequences passed in one, concatenated chunk. + + kwargs[attn_name] = "padding_causal" # add 'causal' back to the mask + inference_params.reset() + inference_params.setup_before_new_input( + max_input_length=S_max, lengths_tensor=total_sequence_lengths + ) + full_output = model(total_tensor, **kwargs) + full_logits = full_output[ + torch.arange(0, B), total_sequence_lengths - 1, : + ] # last element of each seq. + + # Final result should be close. + torch.testing.assert_close(full_logits, incremental_logits, atol=1e-2, rtol=1e-2) + + os.environ["NVTE_FUSED_ATTN"] = fused_attn_env + + @pytest.mark.parametrize( "shape", [ diff --git a/transformer_engine/common/fused_rope/fused_rope.cu b/transformer_engine/common/fused_rope/fused_rope.cu index e7cf940a57..560b7b55d3 100644 --- a/transformer_engine/common/fused_rope/fused_rope.cu +++ b/transformer_engine/common/fused_rope/fused_rope.cu @@ -15,11 +15,11 @@ namespace transformer_engine { template __device__ void fused_rope_block_forward(const scalar_t *src, const float *freqs, scalar_t *dst, - const int offset_block, const int offset_block_dst, - const int h, const int d, const int d2, const int stride_h, - const int stride_d, const int o_stride_h, - const int o_stride_d) { - int s_id = blockIdx.x; + const int begin_offset, const int offset_block, + const int offset_block_dst, const int h, const int d, + const int d2, const int stride_h, const int stride_d, + const int o_stride_h, const int o_stride_d) { + int s_id = blockIdx.x + begin_offset; #pragma unroll for (int d_id = threadIdx.x; d_id < d2; d_id += blockDim.x) { float v_cos, v_sin; @@ -52,11 +52,11 @@ __device__ void fused_rope_block_forward(const scalar_t *src, const float *freqs template __device__ void fused_rope_block_backward(const scalar_t *src, const float *freqs, scalar_t *dst, - const int offset_block, const int offset_block_dst, - const int h, const int d, const int d2, - const int stride_h, const int stride_d, + const int begin_offset, const int offset_block, + const int offset_block_dst, const int h, const int d, + const int d2, const int stride_h, const int stride_d, const int o_stride_h, const int o_stride_d) { - int s_id = blockIdx.x; + int s_id = blockIdx.x + begin_offset; #pragma unroll for (int d_id = threadIdx.x; d_id < d2; d_id += blockDim.x) { float v_cos = cosf(freqs[s_id * d2 + d_id]); @@ -88,68 +88,75 @@ __device__ void fused_rope_block_backward(const scalar_t *src, const float *freq } template -__global__ void fused_rope_forward_kernel(const scalar_t *src, const float *freqs, scalar_t *dst, - const int h, const int d, const int d2, - const int stride_s, const int stride_b, - const int stride_h, const int stride_d, - const int o_stride_s, const int o_stride_b, - const int o_stride_h, const int o_stride_d) { +__global__ void fused_rope_forward_kernel(const scalar_t *src, const float *freqs, + const int *start_positions, scalar_t *dst, const int h, + const int d, const int d2, const int stride_s, + const int stride_b, const int stride_h, + const int stride_d, const int o_stride_s, + const int o_stride_b, const int o_stride_h, + const int o_stride_d) { int s_id = blockIdx.x, b_id = blockIdx.y; + int begin_offset = (start_positions == 0) ? 
0 : start_positions[b_id]; int offset_block = s_id * stride_s + b_id * stride_b; int offset_block_dst = s_id * o_stride_s + b_id * o_stride_b; - fused_rope_block_forward(src, freqs, dst, offset_block, offset_block_dst, h, d, d2, stride_h, - stride_d, o_stride_h, o_stride_d); + fused_rope_block_forward(src, freqs, dst, begin_offset, offset_block, offset_block_dst, h, d, d2, + stride_h, stride_d, o_stride_h, o_stride_d); } template -__global__ void fused_rope_backward_kernel(const scalar_t *src, const float *freqs, scalar_t *dst, - const int h, const int d, const int d2, - const int stride_s, const int stride_b, - const int stride_h, const int stride_d, - const int o_stride_s, const int o_stride_b, - const int o_stride_h, const int o_stride_d) { +__global__ void fused_rope_backward_kernel(const scalar_t *src, const float *freqs, + const int *start_positions, scalar_t *dst, const int h, + const int d, const int d2, const int stride_s, + const int stride_b, const int stride_h, + const int stride_d, const int o_stride_s, + const int o_stride_b, const int o_stride_h, + const int o_stride_d) { int s_id = blockIdx.x, b_id = blockIdx.y; + int begin_offset = (start_positions == 0) ? 0 : start_positions[b_id]; int offset_block = s_id * stride_s + b_id * stride_b; int offset_block_dst = s_id * o_stride_s + b_id * o_stride_b; - fused_rope_block_backward(src, freqs, dst, offset_block, offset_block_dst, h, d, d2, stride_h, - stride_d, o_stride_h, o_stride_d); + fused_rope_block_backward(src, freqs, dst, begin_offset, offset_block, offset_block_dst, h, d, d2, + stride_h, stride_d, o_stride_h, o_stride_d); } template __global__ void fused_rope_thd_forward_kernel(const scalar_t *src, const int *cu_seqlens, - const float *freqs, scalar_t *dst, const int h, - const int d, const int d2, const int stride_t, - const int stride_h, const int stride_d, - const int o_stride_t, const int o_stride_h, - const int o_stride_d) { + const float *freqs, const int *start_positions, + scalar_t *dst, const int h, const int d, const int d2, + const int stride_t, const int stride_h, + const int stride_d, const int o_stride_t, + const int o_stride_h, const int o_stride_d) { int s_id = blockIdx.x, b_id = blockIdx.y; int t_id = s_id + cu_seqlens[b_id]; if (t_id >= cu_seqlens[b_id + 1]) return; int offset_block = t_id * stride_t; int offset_block_dst = t_id * o_stride_t; - fused_rope_block_forward(src, freqs, dst, offset_block, offset_block_dst, h, d, d2, stride_h, - stride_d, o_stride_h, o_stride_d); + int begin_offset = (start_positions == 0) ? 
0 : start_positions[b_id]; + fused_rope_block_forward(src, freqs, dst, begin_offset, offset_block, offset_block_dst, h, d, d2, + stride_h, stride_d, o_stride_h, o_stride_d); } template __global__ void fused_rope_thd_backward_kernel(const scalar_t *src, const int *cu_seqlens, - const float *freqs, scalar_t *dst, const int h, - const int d, const int d2, const int stride_t, - const int stride_h, const int stride_d, - const int o_stride_t, const int o_stride_h, - const int o_stride_d) { + const float *freqs, const int *start_positions, + scalar_t *dst, const int h, const int d, + const int d2, const int stride_t, const int stride_h, + const int stride_d, const int o_stride_t, + const int o_stride_h, const int o_stride_d) { int s_id = blockIdx.x, b_id = blockIdx.y; int t_id = s_id + cu_seqlens[b_id]; if (t_id >= cu_seqlens[b_id + 1]) return; int offset_block = t_id * stride_t; int offset_block_dst = t_id * o_stride_t; - fused_rope_block_backward(src, freqs, dst, offset_block, offset_block_dst, h, d, d2, stride_h, - stride_d, o_stride_h, o_stride_d); + int begin_offset = (start_positions == 0) ? 0 : start_positions[b_id]; + fused_rope_block_backward(src, freqs, dst, begin_offset, offset_block, offset_block_dst, h, d, d2, + stride_h, stride_d, o_stride_h, o_stride_d); } template -void fused_rope_forward_launcher(const scalar_t *input, const float *freqs, scalar_t *output, - const int s, const int b, const int h, const int d, const int d2, +void fused_rope_forward_launcher(const scalar_t *input, const float *freqs, + const int *start_positions, scalar_t *output, const int s, + const int b, const int h, const int d, const int d2, const int stride_s, const int stride_b, const int stride_h, const int stride_d, const int o_stride_s, const int o_stride_b, const int o_stride_h, const int o_stride_d, cudaStream_t stream) { @@ -158,115 +165,123 @@ void fused_rope_forward_launcher(const scalar_t *input, const float *freqs, scal dim3 threads(THREADS_PER_WARP, warps_per_block); fused_rope_forward_kernel<<>>( - input, freqs, output, h, d, d2, stride_s, stride_b, stride_h, stride_d, o_stride_s, - o_stride_b, o_stride_h, o_stride_d); + input, freqs, start_positions, output, h, d, d2, stride_s, stride_b, stride_h, stride_d, + o_stride_s, o_stride_b, o_stride_h, o_stride_d); NVTE_CHECK_CUDA(cudaGetLastError()); } template void fused_rope_backward_launcher(const scalar_t *output_grads, const float *freqs, - scalar_t *input_grads, const int s, const int b, const int h, - const int d, const int d2, const int stride_s, const int stride_b, - const int stride_h, const int stride_d, const int o_stride_s, - const int o_stride_b, const int o_stride_h, const int o_stride_d, - cudaStream_t stream) { + const int *start_positions, scalar_t *input_grads, const int s, + const int b, const int h, const int d, const int d2, + const int stride_s, const int stride_b, const int stride_h, + const int stride_d, const int o_stride_s, const int o_stride_b, + const int o_stride_h, const int o_stride_d, cudaStream_t stream) { int warps_per_block = h < 16 ? 
4 : 8; dim3 blocks(s, b); dim3 threads(THREADS_PER_WARP, warps_per_block); fused_rope_backward_kernel<<>>( - output_grads, freqs, input_grads, h, d, d2, stride_s, stride_b, stride_h, stride_d, - o_stride_s, o_stride_b, o_stride_h, o_stride_d); + output_grads, freqs, start_positions, input_grads, h, d, d2, stride_s, stride_b, stride_h, + stride_d, o_stride_s, o_stride_b, o_stride_h, o_stride_d); NVTE_CHECK_CUDA(cudaGetLastError()); } template void fused_rope_thd_forward_launcher(const scalar_t *input, const int *cu_seqlens, - const float *freqs, scalar_t *output, const int max_s, - const int b, const int h, const int d, const int d2, - const int stride_t, const int stride_h, const int stride_d, - const int o_stride_t, const int o_stride_h, - const int o_stride_d, cudaStream_t stream) { + const float *freqs, const int *start_positions, + scalar_t *output, const int max_s, const int b, const int h, + const int d, const int d2, const int stride_t, + const int stride_h, const int stride_d, const int o_stride_t, + const int o_stride_h, const int o_stride_d, + cudaStream_t stream) { int warps_per_block = h < 16 ? 4 : 8; dim3 blocks(max_s, b); dim3 threads(THREADS_PER_WARP, warps_per_block); - fused_rope_thd_forward_kernel<<>>(input, cu_seqlens, freqs, output, h, - d, d2, stride_t, stride_h, stride_d, - o_stride_t, o_stride_h, o_stride_d); + fused_rope_thd_forward_kernel<<>>( + input, cu_seqlens, freqs, start_positions, output, h, d, d2, stride_t, stride_h, stride_d, + o_stride_t, o_stride_h, o_stride_d); NVTE_CHECK_CUDA(cudaGetLastError()); } template void fused_rope_thd_backward_launcher(const scalar_t *output_grads, const int *cu_seqlens, - const float *freqs, scalar_t *input_grads, const int max_s, - const int b, const int h, const int d, const int d2, - const int stride_t, const int stride_h, const int stride_d, - const int o_stride_t, const int o_stride_h, - const int o_stride_d, cudaStream_t stream) { + const float *freqs, const int *start_positions, + scalar_t *input_grads, const int max_s, const int b, + const int h, const int d, const int d2, const int stride_t, + const int stride_h, const int stride_d, const int o_stride_t, + const int o_stride_h, const int o_stride_d, + cudaStream_t stream) { int warps_per_block = h < 16 ? 
4 : 8; dim3 blocks(max_s, b); dim3 threads(THREADS_PER_WARP, warps_per_block); fused_rope_thd_backward_kernel<<>>( - output_grads, cu_seqlens, freqs, input_grads, h, d, d2, stride_t, stride_h, stride_d, - o_stride_t, o_stride_h, o_stride_d); + output_grads, cu_seqlens, freqs, start_positions, input_grads, h, d, d2, stride_t, stride_h, + stride_d, o_stride_t, o_stride_h, o_stride_d); NVTE_CHECK_CUDA(cudaGetLastError()); } -void fused_rope_forward(const Tensor &input, const Tensor &freqs, Tensor *output, const int s, - const int b, const int h, const int d, const int d2, const int stride_s, - const int stride_b, const int stride_h, const int stride_d, - const int o_stride_s, const int o_stride_b, const int o_stride_h, - const int o_stride_d, cudaStream_t stream) { +void fused_rope_forward(const Tensor &input, const Tensor &freqs, const Tensor &start_positions, + Tensor *output, const int s, const int b, const int h, const int d, + const int d2, const int stride_s, const int stride_b, const int stride_h, + const int stride_d, const int o_stride_s, const int o_stride_b, + const int o_stride_h, const int o_stride_d, cudaStream_t stream) { TRANSFORMER_ENGINE_TYPE_SWITCH_INPUT( input.data.dtype, scalar_t, fused_rope_forward_launcher(reinterpret_cast(input.data.dptr), reinterpret_cast(freqs.data.dptr), + reinterpret_cast(start_positions.data.dptr), reinterpret_cast(output->data.dptr), s, b, h, d, d2, stride_s, stride_b, stride_h, stride_d, o_stride_s, o_stride_b, o_stride_h, o_stride_d, stream);); } -void fused_rope_backward(const Tensor &output_grads, const Tensor &freqs, Tensor *input_grads, - const int s, const int b, const int h, const int d, const int d2, - const int stride_s, const int stride_b, const int stride_h, - const int stride_d, const int o_stride_s, const int o_stride_b, - const int o_stride_h, const int o_stride_d, cudaStream_t stream) { +void fused_rope_backward(const Tensor &output_grads, const Tensor &freqs, + const Tensor &start_positions, Tensor *input_grads, const int s, + const int b, const int h, const int d, const int d2, const int stride_s, + const int stride_b, const int stride_h, const int stride_d, + const int o_stride_s, const int o_stride_b, const int o_stride_h, + const int o_stride_d, cudaStream_t stream) { TRANSFORMER_ENGINE_TYPE_SWITCH_INPUT( output_grads.data.dtype, scalar_t, fused_rope_backward_launcher(reinterpret_cast(output_grads.data.dptr), reinterpret_cast(freqs.data.dptr), + reinterpret_cast(start_positions.data.dptr), reinterpret_cast(input_grads->data.dptr), s, b, h, d, d2, stride_s, stride_b, stride_h, stride_d, o_stride_s, o_stride_b, o_stride_h, o_stride_d, stream);); } void fused_rope_thd_forward(const Tensor &input, const Tensor &cu_seqlens, const Tensor &freqs, - Tensor *output, const int max_s, const int b, const int h, const int d, - const int d2, const int stride_t, const int stride_h, - const int stride_d, const int o_stride_t, const int o_stride_h, - const int o_stride_d, cudaStream_t stream) { + const Tensor &start_positions, Tensor *output, const int max_s, + const int b, const int h, const int d, const int d2, const int stride_t, + const int stride_h, const int stride_d, const int o_stride_t, + const int o_stride_h, const int o_stride_d, cudaStream_t stream) { TRANSFORMER_ENGINE_TYPE_SWITCH_INPUT( input.data.dtype, scalar_t, fused_rope_thd_forward_launcher(reinterpret_cast(input.data.dptr), reinterpret_cast(cu_seqlens.data.dptr), reinterpret_cast(freqs.data.dptr), + reinterpret_cast(start_positions.data.dptr), 
reinterpret_cast(output->data.dptr), max_s, b, h, d, d2, stride_t, stride_h, stride_d, o_stride_t, o_stride_h, o_stride_d, stream);); } void fused_rope_thd_backward(const Tensor &output_grads, const Tensor &cu_seqlens, - const Tensor &freqs, Tensor *input_grads, const int max_s, const int b, - const int h, const int d, const int d2, const int stride_t, - const int stride_h, const int stride_d, const int o_stride_t, - const int o_stride_h, const int o_stride_d, cudaStream_t stream) { + const Tensor &freqs, const Tensor &start_positions, + Tensor *input_grads, const int max_s, const int b, const int h, + const int d, const int d2, const int stride_t, const int stride_h, + const int stride_d, const int o_stride_t, const int o_stride_h, + const int o_stride_d, cudaStream_t stream) { TRANSFORMER_ENGINE_TYPE_SWITCH_INPUT( output_grads.data.dtype, scalar_t, fused_rope_thd_backward_launcher(reinterpret_cast(output_grads.data.dptr), reinterpret_cast(cu_seqlens.data.dptr), reinterpret_cast(freqs.data.dptr), + reinterpret_cast(start_positions.data.dptr), reinterpret_cast(input_grads->data.dptr), max_s, b, h, d, d2, stride_t, stride_h, stride_d, o_stride_t, o_stride_h, o_stride_d, stream);); @@ -274,58 +289,62 @@ void fused_rope_thd_backward(const Tensor &output_grads, const Tensor &cu_seqlen } // end namespace transformer_engine -void nvte_fused_rope_forward(const NVTETensor input, const NVTETensor freqs, NVTETensor output, - const int s, const int b, const int h, const int d, const int d2, +void nvte_fused_rope_forward(const NVTETensor input, const NVTETensor freqs, + const NVTETensor start_positions, NVTETensor output, const int s, + const int b, const int h, const int d, const int d2, const int stride_s, const int stride_b, const int stride_h, const int stride_d, const int o_stride_s, const int o_stride_b, const int o_stride_h, const int o_stride_d, cudaStream_t stream) { NVTE_API_CALL(nvte_fused_rope_forward); using namespace transformer_engine; fused_rope_forward(*reinterpret_cast(input), - *reinterpret_cast(freqs), reinterpret_cast(output), - s, b, h, d, d2, stride_s, stride_b, stride_h, stride_d, o_stride_s, o_stride_b, - o_stride_h, o_stride_d, stream); + *reinterpret_cast(freqs), + *reinterpret_cast(start_positions), + reinterpret_cast(output), s, b, h, d, d2, stride_s, stride_b, + stride_h, stride_d, o_stride_s, o_stride_b, o_stride_h, o_stride_d, stream); } void nvte_fused_rope_backward(const NVTETensor output_grads, const NVTETensor freqs, - NVTETensor input_grads, const int s, const int b, const int h, - const int d, const int d2, const int stride_s, const int stride_b, - const int stride_h, const int stride_d, const int o_stride_s, - const int o_stride_b, const int o_stride_h, const int o_stride_d, - cudaStream_t stream) { + const NVTETensor start_positions, NVTETensor input_grads, const int s, + const int b, const int h, const int d, const int d2, + const int stride_s, const int stride_b, const int stride_h, + const int stride_d, const int o_stride_s, const int o_stride_b, + const int o_stride_h, const int o_stride_d, cudaStream_t stream) { NVTE_API_CALL(nvte_fused_rope_backward); using namespace transformer_engine; fused_rope_backward(*reinterpret_cast(output_grads), *reinterpret_cast(freqs), + *reinterpret_cast(start_positions), reinterpret_cast(input_grads), s, b, h, d, d2, stride_s, stride_b, stride_h, stride_d, o_stride_s, o_stride_b, o_stride_h, o_stride_d, stream); } void nvte_fused_rope_thd_forward(const NVTETensor input, const NVTETensor cu_seqlens, - const NVTETensor freqs, 
NVTETensor output, const int max_s, - const int b, const int h, const int d, const int d2, - const int stride_t, const int stride_h, const int stride_d, - const int o_stride_t, const int o_stride_h, const int o_stride_d, - cudaStream_t stream) { + const NVTETensor freqs, const NVTETensor start_positions, + NVTETensor output, const int max_s, const int b, const int h, + const int d, const int d2, const int stride_t, const int stride_h, + const int stride_d, const int o_stride_t, const int o_stride_h, + const int o_stride_d, cudaStream_t stream) { NVTE_API_CALL(nvte_fused_rope_thd_forward); using namespace transformer_engine; fused_rope_thd_forward( *reinterpret_cast(input), *reinterpret_cast(cu_seqlens), - *reinterpret_cast(freqs), reinterpret_cast(output), max_s, b, h, d, - d2, stride_t, stride_h, stride_d, o_stride_t, o_stride_h, o_stride_d, stream); + *reinterpret_cast(freqs), *reinterpret_cast(start_positions), + reinterpret_cast(output), max_s, b, h, d, d2, stride_t, stride_h, stride_d, + o_stride_t, o_stride_h, o_stride_d, stream); } void nvte_fused_rope_thd_backward(const NVTETensor output_grads, const NVTETensor cu_seqlens, - const NVTETensor freqs, NVTETensor input_grads, const int max_s, - const int b, const int h, const int d, const int d2, - const int stride_t, const int stride_h, const int stride_d, - const int o_stride_t, const int o_stride_h, const int o_stride_d, - cudaStream_t stream) { + const NVTETensor freqs, const NVTETensor start_positions, + NVTETensor input_grads, const int max_s, const int b, const int h, + const int d, const int d2, const int stride_t, const int stride_h, + const int stride_d, const int o_stride_t, const int o_stride_h, + const int o_stride_d, cudaStream_t stream) { NVTE_API_CALL(nvte_fused_rope_thd_backward); using namespace transformer_engine; - fused_rope_thd_backward(*reinterpret_cast(output_grads), - *reinterpret_cast(cu_seqlens), - *reinterpret_cast(freqs), - reinterpret_cast(input_grads), max_s, b, h, d, d2, stride_t, - stride_h, stride_d, o_stride_t, o_stride_h, o_stride_d, stream); + fused_rope_thd_backward( + *reinterpret_cast(output_grads), + *reinterpret_cast(cu_seqlens), *reinterpret_cast(freqs), + *reinterpret_cast(start_positions), reinterpret_cast(input_grads), + max_s, b, h, d, d2, stride_t, stride_h, stride_d, o_stride_t, o_stride_h, o_stride_d, stream); } diff --git a/transformer_engine/common/include/transformer_engine/fused_rope.h b/transformer_engine/common/include/transformer_engine/fused_rope.h index b92de88eca..01305c1e6d 100644 --- a/transformer_engine/common/include/transformer_engine/fused_rope.h +++ b/transformer_engine/common/include/transformer_engine/fused_rope.h @@ -17,6 +17,7 @@ extern "C" { * * \param[in] input Input tensor for fused rope. * \param[in] freqs The freqs tensor. + * \param[in] start_positions The beginning offsets. * \param[out] output Output tensor. * \param[in] s Length of the s dimension of input. * \param[in] b Length of the b dimension of input. @@ -33,8 +34,9 @@ extern "C" { * \param[in] o_stride_d Stride of the d dimension of output. * \param[in] stream CUDA stream used for the operation. 
*/ -void nvte_fused_rope_forward(const NVTETensor input, const NVTETensor freqs, NVTETensor output, - const int s, const int b, const int h, const int d, const int d2, +void nvte_fused_rope_forward(const NVTETensor input, const NVTETensor freqs, + const NVTETensor start_positions, NVTETensor output, const int s, + const int b, const int h, const int d, const int d2, const int stride_s, const int stride_b, const int stride_h, const int stride_d, const int o_stride_s, const int o_stride_b, const int o_stride_h, const int o_stride_d, cudaStream_t stream); @@ -43,6 +45,7 @@ void nvte_fused_rope_forward(const NVTETensor input, const NVTETensor freqs, NVT * * \param[in] output_grads Incoming gradient tensor for backward. * \param[in] freqs The freqs tensor. + * \param[in] start_positions The tensor with positions of first tokens in sequences. * \param[out] input_grads Input gradient tensor to calculate. * \param[in] s Length of the s dimension of output_grads. * \param[in] b Length of the b dimension of output_grads. @@ -60,43 +63,45 @@ void nvte_fused_rope_forward(const NVTETensor input, const NVTETensor freqs, NVT * \param[in] stream CUDA stream used for the operation. */ void nvte_fused_rope_backward(const NVTETensor output_grads, const NVTETensor freqs, - NVTETensor input_grads, const int s, const int b, const int h, - const int d, const int d2, const int stride_s, const int stride_b, - const int stride_h, const int stride_d, const int o_stride_s, - const int o_stride_b, const int o_stride_h, const int o_stride_d, - cudaStream_t stream); + const NVTETensor start_positions, NVTETensor input_grads, const int s, + const int b, const int h, const int d, const int d2, + const int stride_s, const int stride_b, const int stride_h, + const int stride_d, const int o_stride_s, const int o_stride_b, + const int o_stride_h, const int o_stride_d, cudaStream_t stream); /*! \brief Apply rotary positional embedding to the input tensor in thd format. * - * \param[in] input Input tensor for fused rope. - * \param[in] cu_seqlens The cumulative sum of sequence lengths tensor. - * \param[in] freqs The freqs tensor. - * \param[out] output Output tensor. - * \param[in] max_s Max sequence length. - * \param[in] b Batch size. - * \param[in] h Length of the h dimension of input. - * \param[in] d Length of the d dimension of input. - * \param[in] d2 Length of the d dimension of freqs. - * \param[in] stride_t Stride of the t dimension of input. - * \param[in] stride_h Stride of the h dimension of input. - * \param[in] stride_d Stride of the d dimension of input. - * \param[in] o_stride_t Stride of the t dimension of output. - * \param[in] o_stride_h Stride of the h dimension of output. - * \param[in] o_stride_d Stride of the d dimension of output. - * \param[in] stream CUDA stream used for the operation. + * \param[in] input Input tensor for fused rope. + * \param[in] cu_seqlens The cumulative sum of sequence lengths tensor. + * \param[in] freqs The freqs tensor. + * \param[in] start_positions The tensor with positions of first tokens in sequences. + * \param[out] output Output tensor. + * \param[in] max_s Max sequence length. + * \param[in] b Batch size. + * \param[in] h Length of the h dimension of input. + * \param[in] d Length of the d dimension of input. + * \param[in] d2 Length of the d dimension of freqs. + * \param[in] stride_t Stride of the t dimension of input. + * \param[in] stride_h Stride of the h dimension of input. + * \param[in] stride_d Stride of the d dimension of input. 
+ * \param[in] o_stride_t Stride of the t dimension of output. + * \param[in] o_stride_h Stride of the h dimension of output. + * \param[in] o_stride_d Stride of the d dimension of output. + * \param[in] stream CUDA stream used for the operation. */ void nvte_fused_rope_thd_forward(const NVTETensor input, const NVTETensor cu_seqlens, - const NVTETensor freqs, NVTETensor output, const int max_s, - const int b, const int h, const int d, const int d2, - const int stride_t, const int stride_h, const int stride_d, - const int o_stride_t, const int o_stride_h, const int o_stride_d, - cudaStream_t stream); + const NVTETensor freqs, NVTETensor start_positions, + NVTETensor output, const int max_s, const int b, const int h, + const int d, const int d2, const int stride_t, const int stride_h, + const int stride_d, const int o_stride_t, const int o_stride_h, + const int o_stride_d, cudaStream_t stream); /*! \brief Compute the backward of the fused rope in thd format. * * \param[in] output_grads Incoming gradient tensor for backward. * \param[in] cu_seqlens The cumulative sum of sequence lengths tensor. * \param[in] freqs The freqs tensor. + * \param[in] start_positions The beginning offsets. * \param[out] input_grads Input gradient to calculate. * \param[in] max_s Max sequence length. * \param[in] b Batch size. @@ -112,11 +117,11 @@ void nvte_fused_rope_thd_forward(const NVTETensor input, const NVTETensor cu_seq * \param[in] stream CUDA stream used for the operation. */ void nvte_fused_rope_thd_backward(const NVTETensor output_grads, const NVTETensor cu_seqlens, - const NVTETensor freqs, NVTETensor input_grads, const int max_s, - const int b, const int h, const int d, const int d2, - const int stride_t, const int stride_h, const int stride_d, - const int o_stride_t, const int o_stride_h, const int o_stride_d, - cudaStream_t stream); + const NVTETensor freqs, NVTETensor start_positions, + NVTETensor input_grads, const int max_s, const int b, const int h, + const int d, const int d2, const int stride_t, const int stride_h, + const int stride_d, const int o_stride_t, const int o_stride_h, + const int o_stride_d, cudaStream_t stream); #ifdef __cplusplus } // extern "C" diff --git a/transformer_engine/pytorch/attention.py b/transformer_engine/pytorch/attention.py index fa72ecfa33..7430027335 100644 --- a/transformer_engine/pytorch/attention.py +++ b/transformer_engine/pytorch/attention.py @@ -703,18 +703,43 @@ class InferenceParams: # pylint: disable=too-few-public-methods Parameters ---------- - max_batch_size : int + max_batch_size: int maximum batch size during inference. - max_sequence_length : int - maximum sequence length during inference. + max_sequence_length: int + maximum sequence length during inference. + qkv_format: str + Dimension format for `q`, `k` and `v`, {`sbhd`, `bshd`, `thd`}. + `s` stands for the sequence length dimension, + `b` batch size, `h` the number of attention heads, + `d` head size, and `t` the total number of sequences in a batch, i.e. + `t = sum(s_i) for i = 0...b-1`. 
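A quick sketch of how a `thd` `InferenceParams` object is typically driven across a context phase and a few generation steps may help here. This is an illustration only, not part of the patch; it uses the `setup_before_new_input` and `reset` helpers added further below and assumes a CUDA device, since the length buffers are allocated on `"cuda"`.

```python
import torch
from transformer_engine.pytorch.attention import InferenceParams

# Batch of 2 sequences, cache sized for at most 128 tokens each (illustrative numbers).
inference_params = InferenceParams(max_batch_size=2, max_sequence_length=128, qkv_format="thd")

# Context phase: the two prompts hold 3 and 5 tokens, padded to max_input_length=5.
prompt_lengths = torch.tensor([3, 5], dtype=torch.int32, device="cuda")
inference_params.setup_before_new_input(lengths_tensor=prompt_lengths, max_input_length=5)
# ... run the forward pass over the prompts here ...

# Generation phase: every sequence receives exactly one new token per step.
ones = torch.ones(2, dtype=torch.int32, device="cuda")
for _ in range(2):
    inference_params.setup_before_new_input(lengths_tensor=ones, max_input_length=1)
    # ... run the forward pass for the newly generated tokens here ...

# cached_sequence_lengths is now [3 + 1, 5 + 1]; input_sequence_lengths is [1, 1].
# Reuse the same object (and its buffers) for the next batch of prompts:
inference_params.reset()
```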
""" - def __init__(self, max_batch_size, max_sequence_length): + def __init__(self, max_batch_size, max_sequence_length, qkv_format="bshd"): + assert qkv_format in ["bshd", "sbhd", "thd"] + self.max_sequence_length = max_sequence_length self.max_batch_size = max_batch_size - self.sequence_len_offset = 0 - self.batch_size_offset = 0 + + # self.key_value_memory_dict[layer number] = (key_cache, value_cache) + # if qkv_format in ["bshd", "sbhd"]: (key/value)_cache.shape = [b/s, s/b, h, d] + # # if qkv_format = "thd": (key/value)_cache.shape = [t, h, d] self.key_value_memory_dict = {} + self.qkv_format = qkv_format + + if qkv_format == "thd": + # In thd attention layout input sequences can have different lenghts. + # self.input_sequence_lengths stores tensor of shape [b] with lengths of input sequences + # and self.cached_sequence_lengths is the sum of all previous input lengths tensors - + # equivalently it contains total lengths of cached sequences. + self.cached_sequence_lengths = torch.zeros( + (max_batch_size,), device="cuda", dtype=torch.int32) + self.input_sequence_lengths = torch.zeros( + (max_batch_size,), device="cuda", dtype=torch.int32) + else: + self.sequence_len_offset = 0 + self.batch_size_offset = 0 + self.input_sequence_length = None def swap_key_value_dict(self, batch_indices): """ @@ -742,6 +767,214 @@ def swap_key_value_dict(self, batch_indices): ) + def setup_before_new_input(self, lengths_tensor=None, max_input_length=None, length=None): + """ + Updates parameters representing incoming sequence lengths and lengths + of sequences in the cache. Should be called before every forward pass in the inference. + + Parameters + ---------- + lengths_tensor: torch.Tensor + 1d tensor with sequence lengths in new input. + Should be used only when self.qkv_format = "thd". + max_input_length: int + Should be used only when self.qkv_format = "thd". + If the incoming sequences tensor has shape [b * s, h, d], + this should be equal to s. + length: int + Length of the incoming sequences. + Should be used only when self.qkv_format in ["bshd", "sbhd"]. + """ + if self.qkv_format == "thd": + assert lengths_tensor is not None and max_input_length is not None, \ + "lengths_tensor and max_input_length should not be none for qkv_format = \"thd\"" + torch.add( + self.cached_sequence_lengths, + self.input_sequence_lengths, + out=self.cached_sequence_lengths) + self.input_sequence_lengths.copy_(lengths_tensor) + self.max_incoming_seq_len = max_input_length + + else: + assert length is not None, \ + "length should not be none for qkv_format in [\"bshd\", \"sbhd\"]" + if self.input_sequence_length is not None: + self.sequence_len_offset += self.input_sequence_length + self.input_sequence_length = length + + def reset(self): + """ + Resets the parameters to allow the use of this object in a new generation iteration. + This method does not reallocate buffers, + making it more efficient than creating a new InferenceParams object. + Moreover, reusing the same object with the same buffers is compatible + with the CUDA Graphs. + """ + if self.qkv_format == "thd": + self.cached_sequence_lengths.zero_() + self.input_sequence_lengths.zero_() + else: + self.input_sequence_length = None + self.sequence_len_offset = 0 + + def save_to_kv_cache(self, layer_number, key_layer, value_layer): + """ + Saves key_layer and value_layer in the cache. + + Parameters + ---------- + layer_number: input + layer number of the current `TransformerLayer` when multiple such modules are + concatenated to form a transformer block. 
+ key_layer: torch.Tensor + Tensor - in the format corresponding to self.qkv_format - + representing key_layer. + Notice: if self.qkv_format in ["bshd", "sbhd"] then both layers are in the sbhd format + Notice: if self.qkv_format = "thd", we assume that offsets of the sequences + are of the form k * self.max_incoming_seq_len for k = 0, ..., batch_size-1. + value_layer: torch.Tensor + Tensor - in the format corresponding to self.qkv_format - + representing value_layer. + Notice: if self.qkv_format in ["bshd", "sbhd"] both layers are in the sbhd format + Notice: if self.qkv_format = "thd", we assume that offsets of the sequences + are of the form k * self.max_incoming_seq_len for k = 0, ..., batch_size-1. + """ + # Current kernels work only with contiguous tensors; this can be made faster in the future. + key_layer, value_layer = key_layer.contiguous(), value_layer.contiguous() + inference_key_memory, inference_value_memory = self.key_value_memory_dict[layer_number] + if self.qkv_format == "thd": + channels = inference_key_memory.shape[1] * inference_key_memory.shape[2] # h * d + # This kernel copies the key/value layer into the cache, + # taking into account the thd format and sequence lengths. + tex.attention_copy( + inference_key_memory, + self.cached_sequence_lengths, + self.input_sequence_lengths, + key_layer, + self.max_incoming_seq_len, + self.max_sequence_length, + self.max_batch_size, + channels) + + tex.attention_copy( + inference_value_memory, + self.cached_sequence_lengths, + self.input_sequence_lengths, + value_layer, + self.max_incoming_seq_len, + self.max_sequence_length, + self.max_batch_size, + channels) + key_layer, value_layer = inference_key_memory, inference_value_memory + else: + assert self.qkv_format in ["bshd", "sbhd"], \ + "Attention format not supported by inference." + batch_start = self.batch_size_offset + batch_end = batch_start + key_layer.size(1) + assert batch_end <= inference_key_memory.size(1) + + sequence_start = self.sequence_len_offset + sequence_end = sequence_start + key_layer.size(0) + assert sequence_end <= inference_key_memory.size(0) + + # Copy keys and values into KV-cache + seq_offsets = slice(sequence_start, sequence_end) + batch_offsets = slice(batch_start, batch_end) + inference_key_memory[seq_offsets, batch_offsets, ...] = key_layer + inference_value_memory[seq_offsets, batch_offsets, ...] = value_layer + key_layer = inference_key_memory[:sequence_end, batch_offsets, ...] + value_layer = inference_value_memory[:sequence_end, batch_offsets, ...] + return key_layer, value_layer + + def allocate_memory_for_kv_cache_if_empty( + self, + layer_number, + num_gqa_groups_per_partition, + hidden_size_per_attention_head, + dtype): + """ + Allocates memory for the KV cache of a given layer, if it hasn't been allocated before. + + Parameters + ---------- + layer_number: int + layer number of the current `TransformerLayer` when multiple such modules are + concatenated to form a transformer block. + num_gqa_groups_per_partition: int + This will be the third dimension of the cache tensor. + hidden_size_per_attention_head: int + This will be the fourth dimension of the cache tensor.
+ """ + + if layer_number in self.key_value_memory_dict: + return # Already allocated + + b, s = self.max_batch_size, self.max_sequence_length + + def _allocate_memory(dims): + return torch.zeros( + *dims, + num_gqa_groups_per_partition, + hidden_size_per_attention_head, + dtype=dtype, + device=torch.cuda.current_device(), + ) + + if self.qkv_format == "thd": + inference_key_memory = _allocate_memory((b * s,)) + inference_value_memory = _allocate_memory((b * s,)) + else: + inference_key_memory = _allocate_memory((s, b)) + inference_value_memory = _allocate_memory((s, b)) + self.key_value_memory_dict[layer_number] = ( + inference_key_memory, + inference_value_memory, + ) + + def set_params_to_thd_attention(self, buffers, channels): + """ + Fused attention with q/k/v of thd layout with offsets needs some parameters informing + about sequence lengths. This function computes them and + saves them into the provided buffers. + + Parameters + ---------- + buffers: List[torch.Tensor] + buffers of size [batch_size + 1] for the parameters: + cu_seqlens_q, cu_seqlens_kv, seq_offsets_q, + seq_offsets_k, seq_offsets_v, seq_offsets_o + respectively. + channels: int + value of num_heads * hidden_dim_for_each_head. + + Returns + ---------- + max_seqlen_q: int + Maximal value of query sequence length. + max_seqlen_kv: int + Maximal value of key/value sequence length. + buffers: torch.Tensor + Tensor with filled buffers. + """ + max_seqlen_q, max_seqlen_kv = self.max_incoming_seq_len, self.max_sequence_length + + cu_seqlens_q, cu_seqlens_kv, seq_offsets_q, seq_offsets_k, seq_offsets_v, seq_offsets_o = \ + buffers + + torch.cumsum(self.input_sequence_lengths, dim=0, out=cu_seqlens_q[1:]) + torch.cumsum( + self.cached_sequence_lengths + self.input_sequence_lengths, + dim=0, out=cu_seqlens_kv[1:]) + # If layer has shape [b * s_layer, h, d] + # offsets are of the form [k * s_layer * h * d for k = 0, ..., batch_size] + seq_offsets_q.copy_( + torch.arange(0, self.max_batch_size + 1, device="cuda") * channels * max_seqlen_q) + seq_offsets_k.copy_( + torch.arange(0, self.max_batch_size + 1, device="cuda") * channels * max_seqlen_kv) + seq_offsets_v.copy_(seq_offsets_k) + seq_offsets_o.copy_(seq_offsets_q) + + return max_seqlen_q, max_seqlen_kv, buffers @torch.no_grad() def get_swa_mask( window_size: Tuple[int, int], @@ -2460,33 +2693,44 @@ def forward( freqs: torch.Tensor, tensor_format: str = "sbhd", cu_seqlens: Union[torch.Tensor, None] = None, + beginning_offsets: Union[torch.Tensor, None] = None, ) -> torch.Tensor: + if beginning_offsets is None: + # Each sequence will start from positional encoding corresponding to 0. + # Otherwise sequence i will start from positional encoding + # corresponding to beginning_offsets[i]. 
+ beginning_offsets = torch.Tensor() if freqs.dtype != torch.float32: freqs = freqs.float() if tensor_format == "sbhd": - output = tex.fused_rope_forward(t, freqs, False) + output = tex.fused_rope_forward(t, freqs, beginning_offsets, False) elif tensor_format == "bshd": - output = tex.fused_rope_forward(t.transpose(0, 1), freqs, True).transpose(0, 1) + output = tex.fused_rope_forward( + t.transpose(0, 1), freqs, beginning_offsets, True + ).transpose(0, 1) elif tensor_format == "thd": - output = tex.fused_rope_thd_forward(t, cu_seqlens, freqs) + output = tex.fused_rope_thd_forward(t, cu_seqlens, freqs, beginning_offsets) else: raise ValueError(f"Unsupported tensor_format: {tensor_format}.") - ctx.save_for_backward(freqs, cu_seqlens) + ctx.save_for_backward(freqs, cu_seqlens, beginning_offsets) ctx.tensor_format = tensor_format return output @staticmethod - def backward(ctx, grad_output: torch.Tensor) -> Tuple[Union[torch.Tensor, None], ...]: - freqs, cu_seqlens = ctx.saved_tensors + def backward( + ctx, grad_output: torch.Tensor + ) -> Tuple[Union[torch.Tensor, None], ...]: + freqs, cu_seqlens, start_positions = ctx.saved_tensors if ctx.tensor_format == "sbhd": - grad_input = tex.fused_rope_backward(grad_output, freqs, False) + grad_input = tex.fused_rope_backward(grad_output, freqs, start_positions, False) elif ctx.tensor_format == "bshd": grad_input = tex.fused_rope_backward( - grad_output.transpose(0, 1), freqs, True + grad_output.transpose(0, 1), freqs, start_positions, True ).transpose(0, 1) elif ctx.tensor_format == "thd": - grad_input = tex.fused_rope_thd_backward(grad_output, cu_seqlens, freqs) + grad_input = tex.fused_rope_thd_backward( + grad_output, cu_seqlens, freqs, start_positions) else: raise ValueError(f"Unsupported tensor_format: {ctx.tensor_format}.") @@ -2508,6 +2752,7 @@ def apply_rotary_pos_emb( tensor_format: str = "sbhd", fused: bool = False, cu_seqlens: Union[torch.Tensor, None] = None, + start_positions: Union[torch.Tensor, None] = None, ) -> torch.Tensor: """ Apply rotary positional embedding tensor to the input tensor. @@ -2528,12 +2773,18 @@ def apply_rotary_pos_emb( cu_seqlens: torch.Tensor, default = None. Cumulative sum of sequence lengths in a batch for `t`, with shape [b + 1] and dtype torch.int32. Only valid when `tensor_format` is 'thd'. + start_positions: torch.Tensor, default = None. + Token i from sequence s have position encoding corresponding to + position start_positions[i]. If start_positions=None, then this token has position i. """ + assert not (start_positions is not None and not fused), \ + """start_positions != None and fused=False is not supported""" + if fused: assert ( tensor_format != "thd" or cu_seqlens is not None ), "cu_seqlens must not be None when tensor_format is 'thd'." 
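A minimal usage sketch of the new `start_positions` argument follows. The shapes, the rotary-embedding helper call, and the concrete lengths are illustrative assumptions; the only hard requirement, per the assertion above, is `fused=True` (plus `cu_seqlens` when the format is `thd`).

```python
import torch
from transformer_engine.pytorch.attention import RotaryPositionEmbedding, apply_rotary_pos_emb

b, s, h, d = 2, 1, 8, 64   # one new token per sequence, as in a generation step
t = torch.randn(b, s, h, d, device="cuda")

rope = RotaryPositionEmbedding(d)
freqs = rope(1024).to("cuda")   # rotary frequencies for up to 1024 positions

# Sequence 0 already holds 2 tokens and sequence 1 holds 5, so their new tokens
# should be rotated as positions 2 and 5 rather than both as position 0.
start_positions = torch.tensor([2, 5], dtype=torch.int32, device="cuda")

out = apply_rotary_pos_emb(
    t, freqs, tensor_format="bshd", fused=True, start_positions=start_positions
)
```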
- return FusedRoPEFunc.apply(t, freqs, tensor_format, cu_seqlens) + return FusedRoPEFunc.apply(t, freqs, tensor_format, cu_seqlens, start_positions) assert tensor_format in ("sbhd", "bshd"), ( "Only formats `sbhd` or `bshd` are supported for input tensor `t` " @@ -5121,6 +5372,7 @@ def __init__( self.cp_group = cp_group self.cp_global_ranks = cp_global_ranks self.cp_stream = cp_stream + self.channels = kv_channels * num_attention_heads self.hidden_size_per_attention_head = kv_channels @@ -5210,6 +5462,16 @@ def remove_extra_states_check(self, incompatible_keys): # pylint: disable=unuse self.register_load_state_dict_post_hook(remove_extra_states_check) + self._allocator = StaticBufferAllocator() + + + def alloc(self, size, dtype, device): + """ + Allocated the buffer and works correctly with CUDA Graphs. + """ + return self._allocator(size, dtype, device) + + def _checkpointed_attention_forward( self, attention_func: Callable, @@ -5413,21 +5675,7 @@ def forward( first microbatch (since it is the first gradient being produced) """ - with self.prepare_forward( - query_layer, - is_first_microbatch, - num_gemms=3, - allow_non_contiguous=True, - ) as query_layer: - - if self.fp8: - if self.fp8_meta["recipe"].fp8_mha: - if not self.fp8_meta["recipe"].fp8_dpa: - self.fp8_meta["recipe"].fp8_dpa = True - self.logger.WARNING( - """Forcing fp8_meta["recipe"].fp8_dpa=True due to """ - """fp8_meta["recipe"].fp8_mha=True""" - ) + batch_size = key_layer.shape[0] if self.fp8 and self.fp8_meta["recipe"].fp8_dpa: forward_dtype = get_fp8_te_dtype(self.fp8_meta["recipe"], fprop_tensor=True) @@ -5484,28 +5732,26 @@ def forward( key_layer = key_layer.transpose(0, 1) value_layer = value_layer.transpose(0, 1) - ( - inference_key_memory, - inference_value_memory, - ) = inference_params.key_value_memory_dict[self.layer_number] + key_layer, value_layer = inference_params.save_to_kv_cache( + self.layer_number, key_layer, value_layer + ) - batch_start = inference_params.batch_size_offset - batch_end = batch_start + key_layer.size(1) - assert batch_end <= inference_key_memory.size(1) + if qkv_format == "thd": + # Allocation of buffers, it works correctly with CUDA Graphs. + NR_BUFFERS = 6 + buffers = [ + self.alloc(batch_size + 1, dtype=torch.int32, device="cuda") + for _ in range(NR_BUFFERS) + ] - sequence_start = inference_params.sequence_len_offset - sequence_end = sequence_start + key_layer.size(0) - assert sequence_end <= inference_key_memory.size(0) + max_seqlen_q, max_seqlen_kv, buffers = \ + inference_params.set_params_to_thd_attention(buffers, self.channels) + cu_seqlens_q, cu_seqlens_kv, seq_offsets_q, \ + seq_offsets_k, seq_offsets_v, seq_offsets_o = buffers - # Copy keys and values into KV-cache - inference_key_memory[sequence_start:sequence_end, batch_start:batch_end, ...] = ( - key_layer - ) - inference_value_memory[sequence_start:sequence_end, batch_start:batch_end, ...] = ( - value_layer - ) - key_layer = inference_key_memory[:sequence_end, batch_start:batch_end, ...] - value_layer = inference_value_memory[:sequence_end, batch_start:batch_end, ...] + # query_layer is reshaped to the format [t, h, d] + # and make contiguous - needed by the THD attention + query_layer = query_layer.view(-1, *query_layer.shape[2:]).contiguous() if qkv_format == "bshd": key_layer = key_layer.transpose(0, 1) @@ -5618,18 +5864,55 @@ def forward( assert ( core_attention_bias is None ), "core_attention_bias must be None when core_attention_bias_type is alibi!" 
- if ( - _alibi_cache["_num_heads"] != query_layer.shape[-2] - or _alibi_cache["_max_seqlen_q"] != max_seqlen_q - or _alibi_cache["_max_seqlen_kv"] != max_seqlen_kv - or _alibi_cache["_bottom_right_alignment"] != bottom_right_alignment - or _alibi_cache["_alibi_slopes"] is None - ): - _alibi_cache["_alibi_slopes_require_update"] = True - _alibi_cache["_alibi_bias_require_update"] = True + if (_alibi_cache["_num_heads"] != query_layer.shape[-2] + or _alibi_cache["_max_seqlen_q"] != max_seqlen_q + or _alibi_cache["_max_seqlen_kv"] != max_seqlen_kv + or _alibi_cache["_alibi_slopes"] is None): + _alibi_cache["_alibi_slopes_require_update"] = True + _alibi_cache["_alibi_bias_require_update"] = True + + if core_attention_bias_type not in ["no_bias", "alibi"] or core_attention_bias is not None: + use_flash_attention = False - context_parallel = ( - self.cp_group is not None and get_distributed_world_size(self.cp_group) != 1 + fu_core_attention_bias_type = core_attention_bias_type + fu_core_attention_bias = core_attention_bias + if core_attention_bias_type == "alibi" and use_fused_attention and alibi_slopes is not None: + fu_core_attention_bias_type = "post_scale_bias" + _, fu_core_attention_bias = get_alibi( + query_layer.shape[-2], max_seqlen_q, max_seqlen_kv, alibi_slopes=alibi_slopes, + bias_dtype=query_layer.dtype) + if (use_fused_attention + and fu_core_attention_bias_type == "post_scale_bias" + and (fu_core_attention_bias.shape[0] != 1 + or fu_core_attention_bias.shape[1] != query_layer.shape[-2])): + if fu_core_attention_bias.requires_grad: + # remove this line when cuDNN adds bwd support for + # [1, 1, s, s], [b, 1, s, s] and [b, h, s, s] + use_fused_attention = False + else: + # max512 backend will only support [1, h, s, s] + os.environ["NVTE_FUSED_ATTN_BACKEND"] = "1" + + if query_layer.shape[-1] == 256 and query_layer.requires_grad: + # Fused attention is not supported for backward with head_dim = 256. 
+ # to do (cyang): move it to the tex.get_fused_attn_backend + use_fused_attention = False + + if use_fused_attention: + fused_attention_backend = tex.get_fused_attn_backend( + TE_DType[query_layer.dtype] + if not isinstance(query_layer, Float8Tensor) else query_layer._fp8_dtype, + TE_DType[key_layer.dtype] + if not isinstance(key_layer, Float8Tensor) else key_layer._fp8_dtype, + QKVLayout[qkv_layout], + AttnBiasType[fu_core_attention_bias_type], + AttnMaskType[attn_mask_type], + self.attention_dropout, + query_layer.shape[-2], # num_attn_heads + key_layer.shape[-2], # num_gqa_groups + max_seqlen_q, + max_seqlen_kv, + query_layer.shape[-1], # head_dim ) core_attention_bias_shape = None @@ -5663,87 +5946,33 @@ def forward( and not torch.equal(cu_seqlens_kv_padded, cu_seqlens_kv) ) - attention_params = AttentionParams( - qkv_type=type(query_layer), - qkv_dtype=query_layer.dtype, - qkv_layout=qkv_layout, - batch_size=batch_size, - num_heads=query_layer.shape[-2], - num_gqa_groups=key_layer.shape[-2], - max_seqlen_q=max_seqlen_q, - max_seqlen_kv=max_seqlen_kv, - head_dim=query_layer.shape[-1], - attn_mask_type=attn_mask_type, - window_size=window_size, - alibi_slopes_shape=alibi_slopes.shape if alibi_slopes is not None else None, - core_attention_bias_type=core_attention_bias_type, - core_attention_bias_shape=core_attention_bias_shape, - core_attention_bias_requires_grad=( - core_attention_bias.requires_grad if core_attention_bias is not None else False - ), - pad_between_seqs=pad_between_seqs, - attention_dropout=self.attention_dropout, - context_parallel=context_parallel, - deterministic=self.deterministic, - is_training=self.training, - fp8=self.fp8, - fp8_meta=self.fp8_meta, - ) - global _attention_backends - if ( - _attention_backends["attention_params"] is None - or attention_params != _attention_backends["attention_params"] - ): - _attention_backends["attention_params"] = attention_params - _attention_backends["backend_selection_requires_update"] = True - if _attention_backends["backend_selection_requires_update"]: - ( - use_flash_attention, - use_fused_attention, - fused_attention_backend, - use_unfused_attention, - _, - ) = get_attention_backend(attention_params) - if use_flash_attention: - self.logger.info("Running with FlashAttention backend") - elif use_fused_attention: - self.logger.info( - "Running with FusedAttention backend (sub-backend %s)", - int(fused_attention_backend), - ) - elif use_unfused_attention: - self.logger.info("Running with UnfusedDotProductAttention backend") - else: - use_flash_attention = _attention_backends["use_flash_attention"] - use_fused_attention = _attention_backends["use_fused_attention"] - fused_attention_backend = _attention_backends["fused_attention_backend"] - use_unfused_attention = _attention_backends["use_unfused_attention"] - - if use_flash_attention: - if core_attention_bias_type == "alibi": - alibi_slopes, _ = get_alibi( - query_layer.shape[-2], - max_seqlen_q, - max_seqlen_kv, - alibi_slopes=alibi_slopes, - ) - return self.flash_attention( - query_layer, - key_layer, - value_layer, - attention_mask=attention_mask, - qkv_layout=qkv_layout, - cu_seqlens_q=cu_seqlens_q, - cu_seqlens_kv=cu_seqlens_kv, - attn_mask_type=attn_mask_type, - window_size=window_size, - alibi_slopes=alibi_slopes, - cp_group=self.cp_group, - cp_global_ranks=self.cp_global_ranks, - cp_stream=self.cp_stream, - max_seqlen_q=max_seqlen_q, - max_seqlen_kv=max_seqlen_kv, - ) + if self.attention_type == "self": + if self.qkv_format == "bshd" and query_layer.shape[1] != 
value_layer.shape[1] or \ + self.qkv_format == "sbhd" and query_layer.shape[0] != value_layer.shape[0]: + # Flash attention does not self-support max_seqlen_q != max_seqlen_kv + use_flash_attention = False + + if use_flash_attention: + if _NVTE_DEBUG: + print("[DotProductAttention]: using flash-attn",_flash_attn_version) + if core_attention_bias_type == "alibi": + alibi_slopes, _ = get_alibi( + query_layer.shape[-2], max_seqlen_q, max_seqlen_kv, alibi_slopes=alibi_slopes) + return self.flash_attention(query_layer, + key_layer, + value_layer, + attention_mask=attention_mask, + qkv_layout=qkv_layout, + cu_seqlens_q=cu_seqlens_q, + cu_seqlens_kv=cu_seqlens_kv, + attn_mask_type=attn_mask_type, + window_size=window_size, + alibi_slopes=alibi_slopes, + cp_group=self.cp_group, + cp_global_ranks=self.cp_global_ranks, + cp_stream=self.cp_stream, + max_seqlen_q=max_seqlen_q, + max_seqlen_kv=max_seqlen_kv) if use_fused_attention: fu_core_attention_bias_type = core_attention_bias_type @@ -5845,15 +6074,26 @@ def forward( query_layer, key_layer, value_layer, - qkv_layout=qkv_layout, - cu_seqlens_q=cu_seqlens_q, - cu_seqlens_kv=cu_seqlens_kv, - attn_mask_type=attn_mask_type, - attention_mask=attention_mask, - core_attention_bias_type=core_attention_bias_type, - core_attention_bias=core_attention_bias, - alibi_slopes=alibi_slopes, - ) + qkv_layout = qkv_layout, + cu_seqlens_q = cu_seqlens_q, + cu_seqlens_kv = cu_seqlens_kv, + attn_mask_type = attn_mask_type, + attention_mask = attention_mask, + core_attention_bias_type = core_attention_bias_type, + core_attention_bias = core_attention_bias, + alibi_slopes = alibi_slopes) + + return self.unfused_attention(query_layer, + key_layer, + value_layer, + qkv_layout = qkv_layout, + cu_seqlens_q = cu_seqlens_q, + cu_seqlens_kv = cu_seqlens_kv, + attn_mask_type = attn_mask_type, + attention_mask = attention_mask, + core_attention_bias_type = core_attention_bias_type, + core_attention_bias = core_attention_bias, + alibi_slopes = alibi_slopes) raise Exception("No dot product attention support for the provided inputs!") @@ -6206,17 +6446,13 @@ def __init__( **common_gemm_kwargs, ) - def _allocate_memory( - self, inference_max_sequence_len: int, batch_size: int, dtype: torch.dtype - ) -> torch.Tensor: - return torch.empty( - inference_max_sequence_len, - batch_size, - self.num_gqa_groups_per_partition, - self.hidden_size_per_attention_head, - dtype=dtype, - device=torch.cuda.current_device(), - ) + self._allocator = StaticBufferAllocator() + + def alloc(self, size, dtype, device): + """ + Allocated the buffer and works correctly with CUDA Graphs. 
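As background for why these allocations are routed through a module instead of a plain `torch.zeros` call: a captured CUDA graph replays fixed kernels on fixed memory, so every tensor touched inside the captured region must be static and reused across replays. A small, self-contained sketch with stock PyTorch (independent of the classes in this patch) shows the pattern:

```python
import torch

lin = torch.nn.Linear(16, 16).cuda()
static_in = torch.randn(8, 16, device="cuda")   # static input buffer, reused on every replay

# Warm up on a side stream, then capture a single iteration into a graph.
s = torch.cuda.Stream()
s.wait_stream(torch.cuda.current_stream())
with torch.cuda.stream(s):
    for _ in range(3):
        lin(static_in)
torch.cuda.current_stream().wait_stream(s)

g = torch.cuda.CUDAGraph()
with torch.cuda.graph(g):
    static_out = lin(static_in)                 # static output buffer owned by the capture

# Subsequent steps only copy fresh data into the static input and replay the graph;
# nothing inside the captured region may allocate new tensors.
static_in.copy_(torch.randn(8, 16, device="cuda"))
g.replay()
result = static_out.clone()
```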
+ """ + return self._allocator(size, dtype, device) def set_tensor_parallel_group(self, tp_group: Union[dist_group_type, None]) -> None: """ @@ -6360,25 +6596,13 @@ def forward( # Pre-allocate memory for key-values for inference # ================================================= - if inference_params and self.layer_number is not None: - if self.layer_number not in inference_params.key_value_memory_dict: - inf_max_seq_len = inference_params.max_sequence_length - inf_max_batch_size = inference_params.max_batch_size - inference_key_memory = self._allocate_memory( - inf_max_seq_len, inf_max_batch_size, hidden_states.dtype - ) - inference_value_memory = self._allocate_memory( - inf_max_seq_len, inf_max_batch_size, hidden_states.dtype - ) - inference_params.key_value_memory_dict[self.layer_number] = ( - inference_key_memory, - inference_value_memory, - ) - else: - ( - inference_key_memory, - inference_value_memory, - ) = inference_params.key_value_memory_dict[self.layer_number] + if inference_params is not None: + inference_params.allocate_memory_for_kv_cache_if_empty( + self.layer_number, + self.num_gqa_groups_per_partition, + self.hidden_size_per_attention_head, + hidden_states.dtype + ) # ====================== # Query, Key, and Value @@ -6538,21 +6762,42 @@ def forward( q_pos_emb, k_pos_emb = rotary_pos_emb - # adjust key and value for inference - if inference_params is not None: - if self.qkv_format == "sbhd": - sequence_length = key_layer.size(0) - elif self.qkv_format == "bshd": - sequence_length = key_layer.size(1) + if self.qkv_format == "thd" and inference_params is not None: + # For thd attention incoming tokens can be on different positions, + # so we need to copy different positional encoding freqency + # for every sequence in a batch. + # + # For example if sequence lengths in context phase are: 2 and 5 (batch size=2), + # in first generation phase key_layer have shape [2, 1, d]. + # key_layer[0, :] corresponds to the token with position 3 = 2 + 1, + # and key_layer [1, :] corresponds to the token with position 6 = 5 + 1. + + query_layer = apply_rotary_pos_emb( + query_layer, q_pos_emb, "bshd", fused=True, + start_positions=inference_params.cached_sequence_lengths) + key_layer = apply_rotary_pos_emb( + key_layer, k_pos_emb, "bshd", fused=True, + start_positions=inference_params.cached_sequence_lengths) + + else: + # adjust key and value for inference + if inference_params is not None: + if self.qkv_format == "sbhd": + sequence_length = key_layer.size(0) + elif self.qkv_format == "bshd": + sequence_length = key_layer.size(1) + + sequence_start = inference_params.sequence_len_offset + sequence_end = sequence_start + sequence_length - sequence_start = inference_params.sequence_len_offset - sequence_end = sequence_start + sequence_length + q_pos_emb = q_pos_emb[sequence_start:sequence_end, ...] + k_pos_emb = k_pos_emb[sequence_start:sequence_end, ...] - q_pos_emb = q_pos_emb[sequence_start:sequence_end, ...] - k_pos_emb = k_pos_emb[sequence_start:sequence_end, ...] 
+ query_layer = apply_rotary_pos_emb( + query_layer, q_pos_emb, self.qkv_format, fused=True) + key_layer = apply_rotary_pos_emb( + key_layer, k_pos_emb, self.qkv_format, fused=True) - query_layer = apply_rotary_pos_emb(query_layer, q_pos_emb, self.qkv_format, fused=True) - key_layer = apply_rotary_pos_emb(key_layer, k_pos_emb, self.qkv_format, fused=True) # =========================== # Core attention computation @@ -6576,6 +6821,12 @@ def forward( inference_params=inference_params, ) + if self.qkv_format == "thd": + # [b * sq, h] -> [qs, b, h] + context_layer = context_layer.view( + (inference_params.max_batch_size, -1, context_layer.shape[1]) + ).contiguous() + # =================== # Output. [sq, b, h] # =================== @@ -6596,3 +6847,20 @@ def forward( if self.input_layernorm and self.return_layernorm_output: outputs += (layernorm_output,) return outputs if len(outputs) > 1 else outputs[0] + + +class StaticBufferAllocator(torch.nn.Module): + """ + This class is used when we use te.make_graphed_callable(). + CUDA Graphs require all tensors to be static. Neverthless, + torch API make_graphed_callable() takes care of output of torch modules, + and makes them static. Thus by wrapping allocation of memory into + torch.nn.Module, we can greatly simplify our code. + """ + + # pylint: disable=no-self-use + def forward(self, size, dtype, device): + """ + Return buffer of given size, dtype and device. + """ + return torch.zeros(size, dtype=dtype, device=device) diff --git a/transformer_engine/pytorch/csrc/extensions.h b/transformer_engine/pytorch/csrc/extensions.h index f06b0cb197..40ec6959d2 100644 --- a/transformer_engine/pytorch/csrc/extensions.h +++ b/transformer_engine/pytorch/csrc/extensions.h @@ -357,16 +357,18 @@ void fused_amax_and_scale_update_after_reduction(const at::Tensor &amax_reductio **************************************************************************************************/ at::Tensor fused_rope_forward(const at::Tensor &input, const at::Tensor &freqs, + const at::Tensor &start_positions, const bool transpose_output_memory); at::Tensor fused_rope_backward(const at::Tensor &output_grads, const at::Tensor &freqs, + const at::Tensor &start_positions, const bool transpose_output_memory); at::Tensor fused_rope_thd_forward(const at::Tensor &input, const at::Tensor &cu_seqlens, - const at::Tensor &freqs); + const at::Tensor &freqs, const at::Tensor &start_positions); at::Tensor fused_rope_thd_backward(const at::Tensor &output_grads, const at::Tensor &cu_seqlens, - const at::Tensor &freqs); + const at::Tensor &freqs, const at::Tensor &start_positions); /*************************************************************************************************** * Miscellaneous @@ -376,6 +378,17 @@ size_t get_cublasLt_version(); size_t get_cudnn_version(); +bool userbuf_comm_available(); + +void placeholder(); + +/*************************************************************************************************** + * Generation + **************************************************************************************************/ + +void attention_copy(torch::Tensor A, torch::Tensor seq_len, torch::Tensor incoming_seq_len, + torch::Tensor B, int max_incoming_seq_len, int max_seq_len, int b, int s); + /*************************************************************************************************** * Support THD format for Context Parallel **************************************************************************************************/ diff --git 
a/transformer_engine/pytorch/csrc/extensions/apply_rope.cu b/transformer_engine/pytorch/csrc/extensions/apply_rope.cu index c58ba91d5e..8dc0545e26 100644 --- a/transformer_engine/pytorch/csrc/extensions/apply_rope.cu +++ b/transformer_engine/pytorch/csrc/extensions/apply_rope.cu @@ -7,6 +7,7 @@ #include "extensions.h" at::Tensor fused_rope_forward(const at::Tensor &input, const at::Tensor &freqs, + const at::Tensor &start_positions, const bool transpose_output_memory) { using namespace transformer_engine; TORCH_CHECK(input.dim() == 4, "expected 4D tensor"); @@ -55,16 +56,19 @@ at::Tensor fused_rope_forward(const at::Tensor &input, const at::Tensor &freqs, auto input_cu = makeTransformerEngineTensor(input); auto freqs_cu = makeTransformerEngineTensor(freqs); + auto start_positions_cu = makeTransformerEngineTensor(start_positions); auto output_cu = makeTransformerEngineTensor(output); - nvte_fused_rope_forward(input_cu.data(), freqs_cu.data(), output_cu.data(), s, b, h, d, d2, - stride_s, stride_b, stride_h, stride_d, o_stride_s, o_stride_b, - o_stride_h, o_stride_d, at::cuda::getCurrentCUDAStream()); + nvte_fused_rope_forward(input_cu.data(), freqs_cu.data(), start_positions_cu.data(), + output_cu.data(), s, b, h, d, d2, stride_s, stride_b, stride_h, stride_d, + o_stride_s, o_stride_b, o_stride_h, o_stride_d, + at::cuda::getCurrentCUDAStream()); return output; } at::Tensor fused_rope_backward(const at::Tensor &output_grads, const at::Tensor &freqs, + const at::Tensor &start_positions, const bool transpose_output_memory) { using namespace transformer_engine; TORCH_CHECK(output_grads.dim() == 4, "expected 4D tensor"); @@ -111,17 +115,19 @@ at::Tensor fused_rope_backward(const at::Tensor &output_grads, const at::Tensor auto output_grads_cu = makeTransformerEngineTensor(output_grads); auto freqs_cu = makeTransformerEngineTensor(freqs); + auto start_positions_cu = makeTransformerEngineTensor(start_positions); auto input_grads_cu = makeTransformerEngineTensor(input_grads); - nvte_fused_rope_backward(output_grads_cu.data(), freqs_cu.data(), input_grads_cu.data(), s, b, h, - d, d2, stride_s, stride_b, stride_h, stride_d, o_stride_s, o_stride_b, - o_stride_h, o_stride_d, at::cuda::getCurrentCUDAStream()); + nvte_fused_rope_backward(output_grads_cu.data(), freqs_cu.data(), start_positions_cu.data(), + input_grads_cu.data(), s, b, h, d, d2, stride_s, stride_b, stride_h, + stride_d, o_stride_s, o_stride_b, o_stride_h, o_stride_d, + at::cuda::getCurrentCUDAStream()); return input_grads; } at::Tensor fused_rope_thd_forward(const at::Tensor &input, const at::Tensor &cu_seqlens, - const at::Tensor &freqs) { + const at::Tensor &freqs, const at::Tensor &start_positions) { using namespace transformer_engine; TORCH_CHECK(input.dim() == 3, "expected 3D tensor"); TORCH_CHECK(cu_seqlens.dim() == 1, "expected 1D tensor"); @@ -163,16 +169,18 @@ at::Tensor fused_rope_thd_forward(const at::Tensor &input, const at::Tensor &cu_ auto cu_seqlens_cu = makeTransformerEngineTensor(cu_seqlens); auto freqs_cu = makeTransformerEngineTensor(freqs); auto output_cu = makeTransformerEngineTensor(output); + auto start_positions_cu = makeTransformerEngineTensor(start_positions); nvte_fused_rope_thd_forward(input_cu.data(), cu_seqlens_cu.data(), freqs_cu.data(), - output_cu.data(), max_s, b, h, d, d2, stride_t, stride_h, stride_d, - o_stride_t, o_stride_h, o_stride_d, at::cuda::getCurrentCUDAStream()); + start_positions_cu.data(), output_cu.data(), max_s, b, h, d, d2, + stride_t, stride_h, stride_d, o_stride_t, o_stride_h, 
o_stride_d, + at::cuda::getCurrentCUDAStream()); return output; } at::Tensor fused_rope_thd_backward(const at::Tensor &output_grads, const at::Tensor &cu_seqlens, - const at::Tensor &freqs) { + const at::Tensor &freqs, const at::Tensor &start_positions) { using namespace transformer_engine; TORCH_CHECK(output_grads.dim() == 3, "expected 3D tensor"); TORCH_CHECK(cu_seqlens.dim() == 1, "expected 1D tensor"); @@ -212,10 +220,11 @@ at::Tensor fused_rope_thd_backward(const at::Tensor &output_grads, const at::Ten auto cu_seqlens_cu = makeTransformerEngineTensor(cu_seqlens); auto freqs_cu = makeTransformerEngineTensor(freqs); auto input_grads_cu = makeTransformerEngineTensor(input_grads); + auto start_positions_cu = makeTransformerEngineTensor(start_positions); nvte_fused_rope_thd_backward(output_grads_cu.data(), cu_seqlens_cu.data(), freqs_cu.data(), - input_grads_cu.data(), max_s, b, h, d, d2, stride_t, stride_h, - stride_d, o_stride_t, o_stride_h, o_stride_d, + start_positions_cu.data(), input_grads_cu.data(), max_s, b, h, d, d2, + stride_t, stride_h, stride_d, o_stride_t, o_stride_h, o_stride_d, at::cuda::getCurrentCUDAStream()); return input_grads; diff --git a/transformer_engine/pytorch/csrc/extensions/generation.cu b/transformer_engine/pytorch/csrc/extensions/generation.cu new file mode 100644 index 0000000000..5a162f1af6 --- /dev/null +++ b/transformer_engine/pytorch/csrc/extensions/generation.cu @@ -0,0 +1,55 @@ +/************************************************************************* + * Copyright (c) 2022-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * + * See LICENSE for license information. + ************************************************************************/ + +#include "extensions.h" + +// Kernel used to update KV chache when attention layout is "thd". 
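A rough PyTorch transcription of the copy that the kernel below performs may be easier to read than the CUDA indexing (for illustration only; the extension uses the CUDA version): for every sequence in the batch it appends the freshly computed keys or values right after that sequence's already-cached tokens.

```python
import torch

def attention_copy_reference(cache, cached_lens, incoming_lens, hidden,
                             max_incoming_seq_len, max_seq_len, batch_size, channels):
    # cache:  [batch_size * max_seq_len, channels]          per-sequence slots, flattened
    # hidden: [batch_size * max_incoming_seq_len, channels] padded new keys/values
    cache = cache.view(batch_size, max_seq_len, channels)
    hidden = hidden.view(batch_size, max_incoming_seq_len, channels)
    for b in range(batch_size):
        n_new = int(incoming_lens[b])
        start = int(cached_lens[b])
        cache[b, start:start + n_new] = hidden[b, :n_new]
    return cache.view(-1, channels)
```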
+template +__global__ void attention_copy_kernel(scalar_t* cache_tensor, int* seq_len, int* incoming_seq_len, + scalar_t* hidden_tensor, int max_incoming_seq_len, + int max_seq_len, int b, int s) { + for (int batch_idx = blockIdx.x; batch_idx < b; batch_idx += gridDim.x) { + int to_copy = s * incoming_seq_len[batch_idx]; + int offset = seq_len[batch_idx]; + + scalar_t* begin_cache_copy = cache_tensor + max_seq_len * s * batch_idx + s * offset; + scalar_t* begin_hidden_copy = hidden_tensor + s * batch_idx * max_incoming_seq_len; + + for (int i = threadIdx.x; i < to_copy; i += blockDim.x) { + *(begin_cache_copy + i) = *(begin_hidden_copy + i); + } + } +} + +template +void attention_copy_launcher(torch::Tensor A, torch::Tensor seq_len, torch::Tensor incoming_seq_len, + torch::Tensor B, int max_incoming_seq_len, int max_seq_len, int b, + int s) { + attention_copy_kernel<<<16, 256, 0, at::cuda::getCurrentCUDAStream()>>>( + reinterpret_cast(A.data_ptr()), seq_len.data_ptr(), + incoming_seq_len.data_ptr(), reinterpret_cast(B.data_ptr()), + max_incoming_seq_len, max_seq_len, b, s); +} + +void attention_copy(torch::Tensor A, torch::Tensor seq_len, torch::Tensor incoming_seq_len, + torch::Tensor B, int max_incoming_seq_len, int max_seq_len, int b, int s) { + if (A.scalar_type() == at::ScalarType::Half) { + using dtype = at::Half; + attention_copy_launcher(A, seq_len, incoming_seq_len, B, max_incoming_seq_len, + max_seq_len, b, s); + + } else if (A.scalar_type() == at::ScalarType::BFloat16) { + using dtype = at::BFloat16; + attention_copy_launcher(A, seq_len, incoming_seq_len, B, max_incoming_seq_len, + max_seq_len, b, s); + } else if (A.scalar_type() == at::ScalarType::Float) { + using dtype = float; + attention_copy_launcher(A, seq_len, incoming_seq_len, B, max_incoming_seq_len, + max_seq_len, b, s); + } else { + NVTE_ERROR("Unsupported dtype of out\n"); + } +} diff --git a/transformer_engine/pytorch/csrc/extensions/pybind.cpp b/transformer_engine/pytorch/csrc/extensions/pybind.cpp index 89bce77ded..d250ce4484 100644 --- a/transformer_engine/pytorch/csrc/extensions/pybind.cpp +++ b/transformer_engine/pytorch/csrc/extensions/pybind.cpp @@ -155,6 +155,9 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { py::call_guard()); m.attr("_num_cublas_streams") = py::int_(transformer_engine::num_streams); + // Generation + m.def("attention_copy", &attention_copy, "attention_copy"); + // Support THD format for Context Parallel m.def("thd_read_half_tensor", &thd_read_half_tensor, "Read the first half(half_idx=0) or the second half(half_idx=1) of each sequence in a THD " diff --git a/transformer_engine/pytorch/transformer.py b/transformer_engine/pytorch/transformer.py index 130cf91f0e..3e077a4c07 100644 --- a/transformer_engine/pytorch/transformer.py +++ b/transformer_engine/pytorch/transformer.py @@ -184,6 +184,10 @@ class TransformerLayer(torch.nn.Module): head size. Note that these formats are very closely related to the `qkv_format` in the `MultiHeadAttention` and `DotProductAttention` modules. + Notion: The experimental version of the 'thd' attention is supported + when :attr:`inference_params` is passed to the forward function. + + Parallelism parameters ---------------------- @@ -280,6 +284,9 @@ def __init__( ) -> None: super().__init__() + if ub_tp_comm_overlap: + assert tex.userbuf_comm_available(), "Userbuffer communication backend not available." 
+ self.self_attn_mask_type = self_attn_mask_type self.window_size = check_set_window_size(self_attn_mask_type, window_size) self.enc_dec_attn_mask_type = enc_dec_attn_mask_type
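Putting the pieces together, a THD generation loop over a single layer could look roughly like the sketch below. It is only an outline under several assumptions: the hyperparameters and hidden-state shapes are made up, embedding and sampling are elided, and constructor arguments such as `attn_input_format` come from `TransformerLayer`'s existing API rather than from this patch.

```python
import torch
import transformer_engine.pytorch as te
from transformer_engine.pytorch.attention import InferenceParams

hidden_size, heads = 1024, 16
layer = te.TransformerLayer(
    hidden_size, 4 * hidden_size, heads,
    attn_input_format="thd", self_attn_mask_type="padding_causal",
).cuda().eval()

batch, max_seq = 2, 128
inference_params = InferenceParams(batch, max_seq, qkv_format="thd")

with torch.no_grad():
    # Context phase: prompts of length 3 and 5, padded into a [batch * 5, hidden] tensor.
    prompt_lens = torch.tensor([3, 5], dtype=torch.int32, device="cuda")
    hidden = torch.randn(batch * 5, hidden_size, device="cuda")
    inference_params.setup_before_new_input(lengths_tensor=prompt_lens, max_input_length=5)
    out = layer(hidden, inference_params=inference_params)

    # Generation phase: one new token per sequence and per step.
    ones = torch.ones(batch, dtype=torch.int32, device="cuda")
    for _ in range(16):
        new_hidden = torch.randn(batch, hidden_size, device="cuda")
        inference_params.setup_before_new_input(lengths_tensor=ones, max_input_length=1)
        out = layer(new_hidden, inference_params=inference_params)

inference_params.reset()   # reuse the same buffers for the next batch of prompts
```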