-- Filename: square_e.vhd
-- Created by HDL-SCHEM-Editor at Fri Oct 18 10:44:46 2024
-- ==============================
-- Description of used algorithm:
-- ==============================
-- When the square of a positive number A with the value "abcd" (with a,b,c,d from 0,1) has
-- to be calculated, we could use the usual multiplication scheme:
--
-- A^2 = d*"0000abcd" +
--       c*"000abcd0" +
--       b*"00abcd00" +
--       a*"0abcd000"
--
-- For the implementation, 4 adders with 4 bits each would be needed.
-- But this effort can be reduced, as the 2 numbers have the same digits.
-- In order to show a method to reduce the effort, the number A is written as follows:
--
-- A = a*2^3 + b*2^2 + c*2^1 + d
--
-- When we calculate A^2, the result is (be aware: a^n=a, b^n=b,...):
--
-- A^2 = a*2^3 * (a*2^3 + b*2^2 + c*2^1 + d) +
--       b*2^2 * (a*2^3 + b*2^2 + c*2^1 + d) +
--       c*2^1 * (a*2^3 + b*2^2 + c*2^1 + d) +
--       d     * (a*2^3 + b*2^2 + c*2^1 + d) =
--
--     =    a*2^6 + a*b*2^5 + a*c*2^4 + a*d*2^3 +
--                  a*b*2^5 +   b*2^4 + b*c*2^3 + b*d*2^2 +
--                            a*c*2^4 + b*c*2^3 +   c*2^2 + c*d*2^1 +
--                                      a*d*2^3 + b*d*2^2 + c*d*2^1 + d =
--
--     =    a*2^6 + b*2^4 + c*2^2 + d +
--        a*b*2^6 + a*c*2^5 + a*d*2^4 + b*c*2^4 + b*d*2^3 + c*d*2^2 =
--
--     =    a*2^6 + b*2^4 + c*2^2 + d +
--       a*(b*2^6 + c*2^5 + d*2^4) + b*(c*2^4 + d*2^3) + c*d*2^2 =
--
--     =    a*2^6 + 0*2^5 + b*2^4 + 0*2^3 + c*2^2 + 0*2^1 + d +
--       a*(b*2^6 + c*2^5 + d*2^4)+
--       b*(c*2^4 + d*2^3)+
--       c*(d*2^2) =
--
--     = c*"0000d00" +
--       b*"00cd000" +
--       a*"bcd0000" +
--         "a0b0c0d"
--
-- In case of 8 bit the scheme looks like:
--
-- A^2 =   000000000000000 +      Adding 0 makes no sense here, but is needed later.
--       g*000000000000h00 +
--       f*0000000000gh000 +
--       e*00000000fgh0000 +
--       d*000000efgh00000 +
--       c*0000defgh000000 +
--       b*00cdefgh0000000 +
--       a*bcdefgh00000000 +
--         a0b0c0d0e0f0g0h
--
-- Now the scheme is expanded by showing the intermediate sums:
--
-- A^2 =   000000000000000 +
--       g*000000000000h00 =
--         ---------------
--                    0h00 +
--       f*0000000000gh000 =
--         ---------------
--                  0ghh00 +
--       e*00000000fgh0000 =
--         ---------------
--                sssshh00 +
--       d*000000efgh00000 =
--         ---------------
--              sssssshh00 +
--       c*0000defgh000000 =
--         ---------------
--            sssssssshh00 +
--       b*00cdefgh0000000 =
--         ---------------
--          sssssssssshh00 +
--       a*bcdefgh00000000 =
--         ---------------
--         ssssssssssshh00 +
--         a0b0c0d0e0f0g0h =         "spread" operand
--         ---------------
--         SSSSSSSSSSSSSSS
--
-- Now the bits of the "spread" operand are shifted to other places and the last addition is removed:
--
-- A^2 =   000000000000g00 +          Sum-bits which have reached its end result are named 'S'.
--       g*000000000000h00 =
--         ---------------
--                   fsS00 +
--       f*0000000000gh000 =
--         ---------------
--                 essSS00 +
--       e*00000000fgh0000 =
--         ---------------
--               dsssSSS00 +
--       d*000000efgh00000 =
--         ---------------
--             cssssSSSS00 +
--       c*0000defgh000000 =
--         ---------------
--           bsssssSSSSS00 +
--       b*00cdefgh0000000 =
--         ---------------
--         assssssSSSSSS00 +
--       a*bcdefgh0000000h =
--         ---------------
--         SSSSSSSSSSSSS0h
--
-- With the common multiplication scheme, a negative multiplicand (in 2's complement) is handled correctly,
-- but a negative multiplier, which is handled bit by bit as usual, is interpreted as a value which is too big by 2^n,
-- so the product must be reduced by 2^n*multiplicand at the end.
-- But the situation is different in the optimized square scheme. Here both numbers are interpreted as positive numbers.
-- If the operand k is a negative integer, the binary value of k (in 2's complement) is 2^n+k,
-- where n is the number of bits of the 2's complement. So the square algorithm calculates:
--
-- square = (2^n + k)^2 =
--        = 2^(2*n) + k*2^(n+1) + k^2
--
-- As square has 2*n-1 bits at maximum, the summand 2^(2*n) can be ignored.
-- But the term k*2^(n+1) must be subtracted from the result.
--
-- In this 8 bit example (n=8) the value k*2^(n+1), limited to 15 bit, calculates to:
--
-- fix = k*2^(n+1) = abcdefgh * 2^(9) = cdefgh000000000
--
-- As this value must be subtracted, the 2's complement of the value must be added:
--
-- -fix = not (cdefgh000000000) + 1
--
-- This new addition, depending on bit a which signals a negative number, is added last to the algorithm:
--
-- A^2 =   000000000000g00 +     Sum-bits which have reached its end result are named 'S'.
--       g*000000000000h00 =
--         ---------------
--                   fsS00 +
--       f*0000000000gh000 =
--         ---------------
--                 essSS00 +
--       e*00000000fgh0000 =
--         ---------------
--               dsssSSS00 +
--       d*000000efgh00000 =
--         ---------------
--             cssssSSSS00 +
--       c*0000defgh000000 =
--         ---------------
--           bsssssSSSSS00 +
--       b*00cdefgh0000000 =
--         ---------------
--         assssssSSSSSS00 +
--       a*bcdefgh0000000h =
--         ---------------
--         ssssssSSSSSSS0h +
-- a * not(cdefgh)               <- CarryIn=a at the least significant bit
--         ---------------
--         SSSSSSSSSSSSS0h
--
-- In each addition parts of the original operand are shifted and added.
-- In order to avoid shifting the operand to the left, the intermediate sums are shifted right.
-- Least significant bits which always have the value 0 are not shown any more:
--
-- A^2 =         000000g  +          Sum-bits which have reached its end result are named 'S'.
--             g*000000h  =          1 adder
--               -------
--               00000sS  -> shift
--                    fsS +
--             f*00000gh  =          2 adder
--               -------
--               0000ssS  -> shift
--                   essS +
--             e*0000fgh  =          3 adder
--               -------
--               000sssS  -> shift
--                  dsssS +
--             d*000efgh  =          4 adder
--               -------
--               00ssssS  -> shift
--                 cssssS +
--             c*00defgh  =          5 adder
--               -------
--               0sssssS  -> shift
--                bsssssS +
--             b*0cdefgh  =          6 adder
--               -------
--               ssssssS  -> shift
--               assssssS +
--             a*bcdefgh  =          7 adder
--               -------
--               ssssssS  -> shift
--               assssssS +
--       a * not( cdefgh) =          6 adder   <- CarryIn=a at the least significant bit
--               -------
--               _SSSSSS.......0h    The '.'s are filled with the 'S's calculated before.
--
-- While the multiplication of two 8 bit numbers needs 8*8=64 adders,
-- the optimized algorithm for calculating the square of an 8 bit number only needs 34 adders.
--
library ieee;
use ieee.std_logic_1164.all;
use ieee.numeric_std.all;
-- Interface of the square calculator.
-- The operand is a signed (2's complement) number of g_operand_width bits and
-- the result port is a signed number with twice as many bits (2*g_operand_width).
-- NOTE(review): only the entity is visible here; the roles noted below for clk_i,
-- res_i, start_i, ready_o and g_latency are inferred from their names and must be
-- confirmed against the architecture.
entity square is
    generic (
        -- Number of bits of operand_i; square_o is declared with 2*g_operand_width bits.
        constant g_operand_width : natural := 16;-- Allowed values: 2...n
        -- Presumably the number of clock cycles the calculation may take
        -- (0 would then mean a purely combinational result) - TODO confirm.
        constant g_latency       : natural := 16 -- Allowed values: 0...k
    );
    port (
        -- Presumably the clock of the module - TODO confirm.
        clk_i     : in  std_logic;
        -- Signed number which shall be squared.
        operand_i : in  signed(g_operand_width-1 downto 0);
        -- Presumably a reset input; polarity and synchronous/asynchronous behaviour
        -- cannot be derived from the entity - TODO confirm.
        res_i     : in  std_logic;
        -- Presumably requests the calculation for the value at operand_i - TODO confirm.
        start_i   : in  std_logic;
        -- Presumably signals that square_o carries a valid result - TODO confirm.
        ready_o   : out std_logic;
        -- Result port, declared as signed with 2*g_operand_width bits.
        square_o  : out signed(2*g_operand_width-1 downto 0)
    );
end entity square;
