-- Introduction
-- The following VHDL is for the minimal nibzF version. Clock around 30 MHz.
-- Nibz Moore State Machine
-- (C) K Ring Technologies Semiconductor 2008-2009
-- http://nibz.googlecode.com
-- BSD or 1 core per chip licence
-- 1 core per chip (two conditions)
-- a. Print K Ring Technologies logo on
-- or near chip at any resolution.
-- b. Any documentation must acknowledge copyright
-- and have http://nibz.googlecode.com URL.
-- Maintained by Simon Jackson, BEng.
-- E-mail: jackokring@gmail.com
-- A Moore machine's outputs are dependent only on the current state.
-- The output is written only when the state changes. (State
-- transitions are synchronous.)
-- A note on sensitivity lists...
-- ==============================
-- Please note the sensitivity lists are not all-encompassing.
-- This may lead to excess power density in high-density VLSI
-- chips. The simulation of the design does not display the
-- transitions of signals which are not critical in evaluating
-- some state outputs. If your synthesis tool supports this low
-- power technique, then good, you're OK. If it does not, then
-- all is not lost. You have two options: either add all signals
-- needed to suppress warnings to the sensitivity list, or
-- just synthesize and treat the simulation Fmax with respect.
-- Some of the fan-outs are high, exceeding 30. This
-- is a big number for fan-out. Synthesis for speed may insert
-- fan-out split drivers for speed, resulting in better
-- performance for little extra area.
-- The design was developed on Quartus II 6.1 - 8.0
-- in MAX IIZ CPLD technology. It is not limited to this.
-- With low area optimization an Fmax of 30 MHz is
-- not out of reach. I have not tried the fast optimization;
-- have a go if you need the speed, but make sure your
-- memory is up to the challenge. Try 10 MHz as an easy
-- target.
library ieee;
use ieee.std_logic_1164.all;
use ieee.std_logic_arith.all;
entity nibzF is
    generic (
        -- Generic processor width in bits. The data bus carries half a
        -- word per transfer, so the architecture assembles each word
        -- from two bus phases.
        wide : natural := 16
    );
    port (
        -- SoC WISHBONE Modified Interface

        -- Reset request; the architecture delays it by two clocks
        -- before applying it to the core registers.
        RST_I : in std_logic;
        -- Core clock; all state changes happen on the rising edge.
        CLK_I : in std_logic;
        -- Address output, one bit wider than a word: the extra
        -- least-significant bit selects which half-word is on the bus.
        A_O   : buffer std_logic_vector(wide downto 0);
        -- Half-width read data.
        D_I   : in std_logic_vector((wide-1)/2 downto 0);
        -- Half-width write data.
        D_O   : out std_logic_vector((wide-1)/2 downto 0);
        -- '0' on write, '1' on read
        RW_O  : buffer std_logic;
        -- Cycle enable: the core only advances on clocks where this is '1'.
        CYC_I : in std_logic;
        -- Interrupt request; a '1' sampled on a clock edge is latched
        -- internally until a BAck instruction consumes it.
        IRQ_I : in std_logic
    );
end entity nibzF;
architecture rtl of nibzF is
-- Local shorthand for the two vector shapes used throughout
subtype word_t is std_logic_vector(wide-1 downto 0);
subtype half_t is std_logic_vector((wide-1)/2 downto 0);

-- Enumerated type for the state machine phase
type cycle_type is (fetch, execute);
-- Direct selector: what is updated when an instruction completes
-- (p, q, r, s, a from memory data; a from the ALU; p from ir)
type reg_seld is (dirp, dirq, dirr, dirs,
                  dira, dirx, dirad);
-- Indirection selector: which pointer register addresses memory
type reg_seli is (indp, indq, indr, inds);
-- Direction of the current memory operation
type mem_op is (rd, wr);
-- Which half of the word is on the half-width bus
type mem_side is (lo, hi);

-- Useful constants
constant z   : word_t := (others => '0');
constant z4  : std_logic_vector(wide-5 downto 0) := (others => '0');
-- NOTE(review): nul is not referenced anywhere in this architecture body
constant nul : word_t := (others => 'Z');

-- Register holding the current state
signal cycle : cycle_type;

-- The register set: pointers p, q, r, s, accumulator a and
-- instruction register ir
signal p, q, r, s : word_t;
signal a, ir      : word_t;

-- The control signals
signal ind : reg_seli;
signal dir : reg_seld;
signal wrt : mem_op;
-- pre is the word address driven on A_O; adrtmp is pre + 1
signal pre, adrtmp : word_t;
-- din is the assembled read word, dout the word being written,
-- xout the ALU result
signal din, dout, xout : word_t;

-- The Half Width Data Bus
signal din_lo  : half_t;    -- low half captured on the first read phase
signal swaprw  : std_logic; -- swap write high for read low
signal swaprw2 : std_logic;
signal hilo    : mem_side;
signal addrw   : word_t;    -- address saved for the delayed high write
signal addrr   : word_t;    -- address saved for the interrupted read

-- alu: two rows of half adders (x = xor terms, a = and terms)
signal x0, a0 : word_t;
signal x1, a1 : word_t;
signal car    : word_t;
signal ctmp   : word_t;
-- alu control (carry flag)
signal cin : std_logic;

-- reset delay
signal rs1, rs2 : std_logic;

-- irq control (latched interrupt request)
signal irq : std_logic;
begin
-- Reset delay: RST_I is passed through two flip-flops (rs1 then rs2)
-- clocked by CLK_I. The rest of the core uses rs2 as its reset, so a
-- change on RST_I reaches the core two rising edges later.
reset_delay : process (CLK_I)
begin
    if rising_edge(CLK_I) then
        -- Signal assignments take effect after the process suspends,
        -- so rs2 receives the value rs1 held before this edge.
        rs1 <= RST_I;
        rs2 <= rs1;
    end if;
end process reset_delay;
-- Logic to advance to the next state
--
-- Main sequencer. On every rising CLK_I edge with CYC_I = '1' it
--   * commits pointer pre-decrement (writes) or post-increment (reads),
--   * completes the current fetch or execute phase when the last bus
--     phase of the access has been reached,
--   * steps the half-width bus phasing (lo/hi) including the delayed
--     high-half write that is interleaved with the next low-half read.
-- rs2 is an asynchronous reset for all architectural registers.
--
-- Fixes relative to the previous revision:
--   * cin (carry flag) is now cleared on reset; it was left undefined,
--     so the first DIfference/SUm after reset saw an unknown carry.
--   * Jumps (dirad) and BAck (dirp) now load the next fetch address
--     (pre) with the jump target. Previously pre was loaded with the
--     old value of p, so the next fetch ran from the sequential address
--     and its post-increment then overwrote the target written to p.
--   * The instruction decode case has an "others" choice so that
--     std_logic metavalues are covered.
process (CLK_I, rs2)
begin
    if (rs2 = '1') then
        cycle <= fetch; -- so that clock causes fetch
        ind <= indp; -- program fetch
        wrt <= rd;
        pre <= z; -- register first address
        p <= z;
        q <= z;
        r <= z;
        s <= z;
        a <= z;
        ir <= z;
        hilo <= lo;
        swaprw <= '0';
        swaprw2 <= '0';
        irq <= '0';
        cin <= '0'; -- start with a defined (clear) carry
    elsif (rising_edge(CLK_I)) then
        -- when not waiting cycle processor
        if (CYC_I = '1' and swaprw = '0') then
            if (wrt = wr) then
                if (hilo = lo) then
                    -- pre decrement: pre already holds pointer - 1,
                    -- commit it to the pointer register
                    case ind is
                        when indp =>
                            null; -- not used so make blank
                        when indq =>
                            q <= pre;
                        when indr =>
                            r <= pre;
                        when inds =>
                            s <= pre;
                    end case;
                end if;
            else
                if (hilo = hi) then
                    -- post increment: adrtmp is pre + 1
                    case ind is
                        when indp =>
                            p <= adrtmp;
                        when indq =>
                            q <= adrtmp;
                        when indr =>
                            r <= adrtmp;
                        when inds =>
                            s <= adrtmp;
                    end case;
                end if;
            end if;
            -- right time to complete execution?
            -- (writes complete on the lo phase, reads on the hi phase)
            if ((wrt = wr and hilo = lo) or (wrt = rd and hilo = hi)) then
                case cycle is
                    when execute =>
                        -- program fetch setup; by default continue at p,
                        -- which the preceding fetch already incremented
                        cycle <= fetch;
                        ind <= indp;
                        wrt <= rd;
                        pre <= p;
                        case dir is
                            when dirp =>
                                if (irq = '1') then
                                    irq <= '0';
                                end if;
                                p <= din;
                                -- fetch from the new p, not the old one
                                pre <= din;
                            when dirq =>
                                if (wrt = rd) then
                                    q <= din;
                                end if;
                            when dirr =>
                                if (wrt = rd) then
                                    r <= din;
                                end if;
                            when dirs =>
                                if (wrt = rd) then
                                    s <= din;
                                end if;
                            when dira =>
                                if (wrt = rd) then
                                    a <= din;
                                end if;
                            when dirx =>
                                a <= xout;
                                if (not(ir(3 downto 2) = "01")) then
                                    -- not on xor
                                    cin <= ctmp(wide-1);
                                end if;
                            when dirad =>
                                p <= ir;
                                -- fetch from the jump target held in ir
                                pre <= ir;
                        end case;
                    when fetch =>
                        if (not(din(wide-1 downto 4) = z4)) then
                            -- jump (there are no branch delay slots!!)
                            -- any word with a non-zero upper part is a
                            -- jump; the return address is pushed via r
                            ind <= indr;
                            wrt <= wr;
                            dir <= dirad;
                            pre <= unsigned(r) - 1;
                            -- botch to do post increment of fetch
                            dout <= unsigned(p) + 1;
                        else
                            case din(3 downto 0) is
                                when "0000" =>
                                    -- BAck (no delay)
                                    if (irq = '1') then
                                        ind <= indp;
                                        pre <= p;
                                    else
                                        ind <= indr;
                                        pre <= r;
                                    end if;
                                    wrt <= rd;
                                    dir <= dirp;
                                when "0001" =>
                                    -- Fetch In
                                    ind <= indq;
                                    wrt <= rd;
                                    dir <= dira;
                                    pre <= q;
                                when "0010" =>
                                    -- Return In
                                    ind <= indr;
                                    wrt <= rd;
                                    dir <= dirq;
                                    pre <= r;
                                when "0011" =>
                                    -- Stack In
                                    ind <= inds;
                                    wrt <= rd;
                                    dir <= dira;
                                    pre <= s;
                                when "0100" =>
                                    -- DIfference
                                    ind <= inds;
                                    wrt <= rd;
                                    dir <= dirx;
                                    pre <= s;
                                when "0101" =>
                                    -- Fetch Address
                                    ind <= inds;
                                    wrt <= rd;
                                    dir <= dirq;
                                    pre <= s;
                                when "0110" =>
                                    -- Return Address
                                    ind <= inds;
                                    wrt <= rd;
                                    dir <= dirr;
                                    pre <= s;
                                when "0111" =>
                                    -- Stack Address
                                    ind <= inds;
                                    wrt <= rd;
                                    dir <= dirs;
                                    pre <= s;
                                when "1000" =>
                                    -- BOth (AND 2*)
                                    ind <= inds;
                                    wrt <= rd;
                                    dir <= dirx;
                                    pre <= s;
                                when "1001" =>
                                    -- Fetch Out
                                    ind <= indq;
                                    wrt <= wr;
                                    dir <= dira;
                                    pre <= unsigned(q) - 1;
                                    dout <= a;
                                when "1010" =>
                                    -- Return Out
                                    ind <= indr;
                                    wrt <= wr;
                                    dir <= dirq;
                                    pre <= unsigned(r) - 1;
                                    dout <= q;
                                when "1011" =>
                                    -- Stack Out
                                    ind <= inds;
                                    wrt <= wr;
                                    dir <= dira;
                                    pre <= unsigned(s) - 1;
                                    dout <= a;
                                when "1100" =>
                                    -- SUm
                                    ind <= inds;
                                    wrt <= rd;
                                    dir <= dirx;
                                    pre <= s;
                                when "1101" =>
                                    -- Fetch Ends
                                    ind <= inds;
                                    wrt <= wr;
                                    dir <= dirq;
                                    pre <= unsigned(s) - 1;
                                    dout <= q;
                                when "1110" =>
                                    -- Return Ends
                                    ind <= inds;
                                    wrt <= wr;
                                    dir <= dirr;
                                    pre <= unsigned(s) - 1;
                                    dout <= r;
                                when "1111" =>
                                    -- Stack Ends
                                    ind <= inds;
                                    wrt <= wr;
                                    dir <= dirs;
                                    pre <= unsigned(s) - 1;
                                    dout <= s;
                                when others =>
                                    -- metavalues ('U', 'X', ...) on the
                                    -- opcode: leave the controls as is
                                    null;
                            end case;
                        end if;
                        ir <= din;
                        cycle <= execute;
                end case;
            end if;
            -- IO control
            swaprw <= swaprw2;
            if (hilo = lo) then
                if (wrt = rd) then
                    -- read lo
                    -- pre is correct
                    hilo <= hi;
                    addrr <= pre;
                    din_lo <= D_I;
                    if (swaprw2 = '1') then
                        -- perform hi write delayed setup
                        -- revert pre for completion
                        pre <= addrw;
                        wrt <= wr;
                        swaprw2 <= '0';
                    else
                        -- perform hi read setup
                        null;
                    end if;
                else
                    -- write lo
                    swaprw2 <= '1';
                    addrw <= pre;
                    -- move the high half down so it is on D_O for the
                    -- delayed high write
                    dout((wide-1)/2 downto 0) <= dout(wide-1 downto wide/2);
                    -- pre is correct
                    -- instruction completes
                end if;
            else
                -- read hi
                -- instruction completes
                hilo <= lo;
            end if;
        elsif (CYC_I = '1' and swaprw = '1') then
            -- clear delayed write: the high write has been on the bus,
            -- return to the read that it interrupted
            swaprw <= swaprw2;
            pre <= addrr;
            wrt <= rd;
        end if;
        -- set interrupt (independent of CYC_I; placed last so that a
        -- new request wins over the clear performed by BAck)
        if (IRQ_I = '1') then
            irq <= '1';
        end if;
    end if;
end process;
process(ir, a, din, cin, x0, a0, x1, a1, car, ctmp)
    -- alu process (purely combinational)
    --
    -- Two rows of half adders with a ripple carry. ir(3 downto 2)
    -- selects which first-row terms are enabled:
    --   "00" : both rows disabled (result is just the carry chain)
    --   "01" : xor only            (DIfference)
    --   "10" : and only, shifted   (BOth, AND 2*)
    --   "11" : xor and and         (SUm)
    -- Outputs: xout (result word) and ctmp (carry terms; the top bit is
    -- sampled as the new carry flag elsewhere in this architecture).
    --
    -- The sensitivity list is complete on purpose, unlike the general
    -- remark in the file header: x0, a0, x1, a1, car and ctmp are
    -- signals that this process both writes and reads, so the process
    -- must re-run when they change. With the shorter list simulation
    -- left xout/ctmp one or more evaluations stale while synthesis,
    -- which ignores the list, built the full combinational adder.
    --
    -- for greater energy efficiency
    -- re-evaluation of the ALU on each cycle
    -- may be avoided if some latches are generated
    -- not done in this version.
begin
    case ir(3 downto 2) is
        when "00" =>
            -- first row of half adders
            x0 <= z;
            a0 <= z;
            -- second row of half adders
            x1 <= x0 xor car;
            a1 <= z;
        when "01" =>
            -- first row of half adders
            x0 <= a xor din;
            a0 <= z;
            -- second row of half adders
            x1 <= x0 xor car;
            a1 <= z;
        when "10" =>
            -- first row of half adders
            x0 <= z;
            a0 <= a and din;
            -- second row of half adders
            x1 <= x0 xor car;
            a1 <= x0 and car;
        when "11" =>
            -- first row of half adders
            x0 <= a xor din;
            a0 <= a and din;
            -- second row of half adders
            x1 <= x0 xor car;
            a1 <= x0 and car;
        when others =>
            -- ir holds a metavalue ('U', 'X', ...): propagate unknown
            -- rather than silently picking one operation
            x0 <= (others => 'X');
            a0 <= (others => 'X');
            x1 <= (others => 'X');
            a1 <= (others => 'X');
    end case;
    -- STANDARD CARRY
    ctmp <= a1 or a0;
    -- carry into bit i is the carry term of bit i-1; cin enters bit 0
    car <= ctmp(wide-2 downto 0) & cin;
    -- sum output
    xout <= x1(wide-1 downto 0);
end process;
process(D_I, din_lo, dout, hilo, pre, wrt)
    -- read/write process (combinational bus interface)
    -- moved indirect assignment here
    -- to allow decode of instruction
begin
    -- bus direction: RW_O is '0' while writing, '1' while reading
    case wrt is
        when wr =>
            RW_O <= '0';
        when rd =>
            RW_O <= '1';
    end case;
    -- A_O from pre: the word address with the bus phase appended as
    -- the least-significant bit ('0' for the hi phase, '1' for lo)
    case hilo is
        when hi =>
            A_O <= pre&'0';
        when lo =>
            A_O <= pre&'1';
    end case;
    -- possible post increment calculation
    adrtmp <= unsigned(pre) + 1;
    -- assembled read word: current bus data above the captured low half
    din <= D_I&din_lo;
    -- the low half of dout is what goes out on the half-width bus
    D_O <= dout(wide/2-1 downto 0);
end process;
end rtl;
-- (end of nibzF listing)