From 0020e355b079710113cef1fcf1823f76e348ef8a Mon Sep 17 00:00:00 2001 From: Matheus Cavalcante Date: Thu, 8 Jul 2021 13:07:55 +0200 Subject: [PATCH 01/20] [hardware] Add shuffle indexes for 2, 8, and 16 lanes --- hardware/include/ara_pkg.sv | 266 +++++++++++++++++++++++++++++++++++- 1 file changed, 264 insertions(+), 2 deletions(-) diff --git a/hardware/include/ara_pkg.sv b/hardware/include/ara_pkg.sv index ab91e3bfc..ee30f4bd7 100644 --- a/hardware/include/ara_pkg.sv +++ b/hardware/include/ara_pkg.sv @@ -206,7 +206,8 @@ package ara_pkg; elen_t scalar_op; logic use_scalar_op; - // 2nd scalar operand: stride for constant-strided vector load/stores, slide offset for vector slides + // 2nd scalar operand: stride for constant-strided vector load/stores, slide offset for vector + // slides elen_t stride; // Destination vector register @@ -381,6 +382,40 @@ package ara_pkg; return idx[byte_idx[2:0]]; end endcase + 2: unique case (ew) + rvv_pkg::EW64: begin + automatic vlen_t [15:0] idx; + idx[15] = 15; idx[14] = 14; idx[13] = 13; idx[12] = 12; + idx[11] = 11; idx[10] = 10; idx[09] = 09; idx[08] = 08; + idx[07] = 07; idx[06] = 06; idx[05] = 05; idx[04] = 04; + idx[03] = 03; idx[02] = 02; idx[01] = 01; idx[00] = 00; + return idx[byte_idx[3:0]]; + end + rvv_pkg::EW32: begin + automatic vlen_t [15:0] idx; + idx[15] = 15; idx[14] = 14; idx[13] = 13; idx[12] = 12; + idx[11] = 07; idx[10] = 06; idx[09] = 05; idx[08] = 04; + idx[07] = 11; idx[06] = 10; idx[05] = 09; idx[04] = 08; + idx[03] = 03; idx[02] = 02; idx[01] = 01; idx[00] = 00; + return idx[byte_idx[3:0]]; + end + rvv_pkg::EW16: begin + automatic vlen_t [15:0] idx; + idx[15] = 15; idx[14] = 14; idx[13] = 07; idx[12] = 06; + idx[11] = 11; idx[10] = 10; idx[09] = 03; idx[08] = 02; + idx[07] = 13; idx[06] = 12; idx[05] = 05; idx[04] = 04; + idx[03] = 09; idx[02] = 08; idx[01] = 01; idx[00] = 00; + return idx[byte_idx[3:0]]; + end + rvv_pkg::EW8: begin + automatic vlen_t [15:0] idx; + idx[15] = 15; idx[14] = 07; idx[13] = 
11; idx[12] = 03; + idx[11] = 13; idx[10] = 05; idx[09] = 09; idx[08] = 01; + idx[07] = 14; idx[06] = 06; idx[05] = 10; idx[04] = 02; + idx[03] = 12; idx[02] = 04; idx[01] = 08; idx[00] = 00; + return idx[byte_idx[3:0]]; + end + endcase 4: unique case (ew) rvv_pkg::EW64: begin automatic vlen_t [31:0] idx; @@ -431,7 +466,234 @@ package ara_pkg; return idx[byte_idx[4:0]]; end endcase - // TODO: Remaining NrLanes. Generalize this case accordingly to the function below. + 8: unique case (ew) + rvv_pkg::EW64: begin + automatic vlen_t [63:0] idx; + idx[63] = 63; idx[62] = 62; idx[61] = 61; idx[60] = 60; + idx[59] = 59; idx[58] = 58; idx[57] = 57; idx[56] = 56; + idx[55] = 55; idx[54] = 54; idx[53] = 53; idx[52] = 52; + idx[51] = 51; idx[50] = 50; idx[49] = 49; idx[48] = 48; + idx[47] = 47; idx[46] = 46; idx[45] = 45; idx[44] = 44; + idx[43] = 43; idx[42] = 42; idx[41] = 41; idx[40] = 40; + idx[39] = 39; idx[38] = 38; idx[37] = 37; idx[36] = 36; + idx[35] = 35; idx[34] = 34; idx[33] = 33; idx[32] = 32; + idx[31] = 31; idx[30] = 30; idx[29] = 29; idx[28] = 28; + idx[27] = 27; idx[26] = 26; idx[25] = 25; idx[24] = 24; + idx[23] = 23; idx[22] = 22; idx[21] = 21; idx[20] = 20; + idx[19] = 19; idx[18] = 18; idx[17] = 17; idx[16] = 16; + idx[15] = 15; idx[14] = 14; idx[13] = 13; idx[12] = 12; + idx[11] = 11; idx[10] = 10; idx[09] = 09; idx[08] = 08; + idx[07] = 07; idx[06] = 06; idx[05] = 05; idx[04] = 04; + idx[03] = 03; idx[02] = 02; idx[01] = 01; idx[00] = 00; + return idx[byte_idx[5:0]]; + end + rvv_pkg::EW32: begin + automatic vlen_t [63:0] idx; + idx[63] = 63; idx[62] = 62; idx[61] = 61; idx[60] = 60; + idx[59] = 55; idx[58] = 54; idx[57] = 53; idx[56] = 52; + idx[55] = 47; idx[54] = 46; idx[53] = 45; idx[52] = 44; + idx[51] = 39; idx[50] = 38; idx[49] = 37; idx[48] = 36; + idx[47] = 31; idx[46] = 30; idx[45] = 29; idx[44] = 28; + idx[43] = 23; idx[42] = 22; idx[41] = 21; idx[40] = 20; + idx[39] = 15; idx[38] = 14; idx[37] = 13; idx[36] = 12; + idx[35] = 07; idx[34] = 06; 
idx[33] = 05; idx[32] = 04; + idx[31] = 59; idx[30] = 58; idx[29] = 57; idx[28] = 56; + idx[27] = 51; idx[26] = 50; idx[25] = 49; idx[24] = 48; + idx[23] = 43; idx[22] = 42; idx[21] = 41; idx[20] = 40; + idx[19] = 35; idx[18] = 34; idx[17] = 33; idx[16] = 32; + idx[15] = 27; idx[14] = 26; idx[13] = 25; idx[12] = 24; + idx[11] = 19; idx[10] = 18; idx[09] = 17; idx[08] = 16; + idx[07] = 11; idx[06] = 10; idx[05] = 09; idx[04] = 08; + idx[03] = 03; idx[02] = 02; idx[01] = 01; idx[00] = 00; + return idx[byte_idx[5:0]]; + end + rvv_pkg::EW16: begin + automatic vlen_t [63:0] idx; + idx[63] = 63; idx[62] = 62; idx[61] = 55; idx[60] = 54; + idx[59] = 47; idx[58] = 46; idx[57] = 39; idx[56] = 38; + idx[55] = 31; idx[54] = 30; idx[53] = 23; idx[52] = 22; + idx[51] = 15; idx[50] = 14; idx[49] = 07; idx[48] = 06; + idx[47] = 59; idx[46] = 58; idx[45] = 51; idx[44] = 50; + idx[43] = 43; idx[42] = 42; idx[41] = 35; idx[40] = 34; + idx[39] = 27; idx[38] = 26; idx[37] = 19; idx[36] = 18; + idx[35] = 11; idx[34] = 10; idx[33] = 03; idx[32] = 02; + idx[31] = 61; idx[30] = 60; idx[29] = 53; idx[28] = 52; + idx[27] = 45; idx[26] = 44; idx[25] = 37; idx[24] = 36; + idx[23] = 29; idx[22] = 28; idx[21] = 21; idx[20] = 20; + idx[19] = 13; idx[18] = 12; idx[17] = 05; idx[16] = 04; + idx[15] = 57; idx[14] = 56; idx[13] = 49; idx[12] = 48; + idx[11] = 41; idx[10] = 40; idx[09] = 33; idx[08] = 32; + idx[07] = 25; idx[06] = 24; idx[05] = 17; idx[04] = 16; + idx[03] = 09; idx[02] = 08; idx[01] = 01; idx[00] = 00; + return idx[byte_idx[5:0]]; + end + rvv_pkg::EW8: begin + automatic vlen_t [63:0] idx; + idx[63] = 63; idx[62] = 55; idx[61] = 47; idx[60] = 39; + idx[59] = 31; idx[58] = 23; idx[57] = 15; idx[56] = 07; + idx[55] = 59; idx[54] = 51; idx[53] = 43; idx[52] = 35; + idx[51] = 27; idx[50] = 19; idx[49] = 11; idx[48] = 03; + idx[47] = 61; idx[46] = 53; idx[45] = 45; idx[44] = 37; + idx[43] = 29; idx[42] = 21; idx[41] = 13; idx[40] = 05; + idx[39] = 57; idx[38] = 49; idx[37] = 41; idx[36] = 
33; + idx[35] = 25; idx[34] = 17; idx[33] = 09; idx[32] = 01; + idx[31] = 62; idx[30] = 54; idx[29] = 46; idx[28] = 38; + idx[27] = 30; idx[26] = 22; idx[25] = 14; idx[24] = 06; + idx[23] = 58; idx[22] = 50; idx[21] = 42; idx[20] = 34; + idx[19] = 26; idx[18] = 18; idx[17] = 10; idx[16] = 02; + idx[15] = 60; idx[14] = 52; idx[13] = 44; idx[12] = 36; + idx[11] = 28; idx[10] = 20; idx[09] = 12; idx[08] = 04; + idx[07] = 56; idx[06] = 48; idx[05] = 40; idx[04] = 32; + idx[03] = 24; idx[02] = 16; idx[01] = 08; idx[00] = 00; + return idx[byte_idx[5:0]]; + end + endcase + 16: unique case (ew) + rvv_pkg::EW64: begin + automatic vlen_t [127:0] idx; + idx[127] = 127; idx[126] = 126; idx[125] = 125; idx[124] = 124; + idx[123] = 123; idx[122] = 122; idx[121] = 121; idx[120] = 120; + idx[119] = 119; idx[118] = 118; idx[117] = 117; idx[116] = 116; + idx[115] = 115; idx[114] = 114; idx[113] = 113; idx[112] = 112; + idx[111] = 111; idx[110] = 110; idx[109] = 109; idx[108] = 108; + idx[107] = 107; idx[106] = 106; idx[105] = 105; idx[104] = 104; + idx[103] = 103; idx[102] = 102; idx[101] = 101; idx[100] = 100; + idx[099] = 099; idx[098] = 098; idx[097] = 097; idx[096] = 096; + idx[095] = 095; idx[094] = 094; idx[093] = 093; idx[092] = 092; + idx[091] = 091; idx[090] = 090; idx[089] = 089; idx[088] = 088; + idx[087] = 087; idx[086] = 086; idx[085] = 085; idx[084] = 084; + idx[083] = 083; idx[082] = 082; idx[081] = 081; idx[080] = 080; + idx[079] = 079; idx[078] = 078; idx[077] = 077; idx[076] = 076; + idx[075] = 075; idx[074] = 074; idx[073] = 073; idx[072] = 072; + idx[071] = 071; idx[070] = 070; idx[069] = 069; idx[068] = 068; + idx[067] = 067; idx[066] = 066; idx[065] = 065; idx[064] = 064; + idx[063] = 063; idx[062] = 062; idx[061] = 061; idx[060] = 060; + idx[059] = 059; idx[058] = 058; idx[057] = 057; idx[056] = 056; + idx[055] = 055; idx[054] = 054; idx[053] = 053; idx[052] = 052; + idx[051] = 051; idx[050] = 050; idx[049] = 049; idx[048] = 048; + idx[047] = 047; idx[046] = 
046; idx[045] = 045; idx[044] = 044; + idx[043] = 043; idx[042] = 042; idx[041] = 041; idx[040] = 040; + idx[039] = 039; idx[038] = 038; idx[037] = 037; idx[036] = 036; + idx[035] = 035; idx[034] = 034; idx[033] = 033; idx[032] = 032; + idx[031] = 031; idx[030] = 030; idx[029] = 029; idx[028] = 028; + idx[027] = 027; idx[026] = 026; idx[025] = 025; idx[024] = 024; + idx[023] = 023; idx[022] = 022; idx[021] = 021; idx[020] = 020; + idx[019] = 019; idx[018] = 018; idx[017] = 017; idx[016] = 016; + idx[015] = 015; idx[014] = 014; idx[013] = 013; idx[012] = 012; + idx[011] = 011; idx[010] = 010; idx[009] = 009; idx[008] = 008; + idx[007] = 007; idx[006] = 006; idx[005] = 005; idx[004] = 004; + idx[003] = 003; idx[002] = 002; idx[001] = 001; idx[000] = 000; + return idx[byte_idx[6:0]]; + end + rvv_pkg::EW32: begin + automatic vlen_t [127:0] idx; + idx[127] = 127; idx[126] = 126; idx[125] = 125; idx[124] = 124; + idx[123] = 119; idx[122] = 118; idx[121] = 117; idx[120] = 116; + idx[119] = 111; idx[118] = 110; idx[117] = 109; idx[116] = 108; + idx[115] = 103; idx[114] = 102; idx[113] = 101; idx[112] = 100; + idx[111] = 095; idx[110] = 094; idx[109] = 093; idx[108] = 092; + idx[107] = 087; idx[106] = 086; idx[105] = 085; idx[104] = 084; + idx[103] = 079; idx[102] = 078; idx[101] = 077; idx[100] = 076; + idx[099] = 071; idx[098] = 070; idx[097] = 069; idx[096] = 068; + idx[095] = 063; idx[094] = 062; idx[093] = 061; idx[092] = 060; + idx[091] = 055; idx[090] = 054; idx[089] = 053; idx[088] = 052; + idx[087] = 047; idx[086] = 046; idx[085] = 045; idx[084] = 044; + idx[083] = 039; idx[082] = 038; idx[081] = 037; idx[080] = 036; + idx[079] = 031; idx[078] = 030; idx[077] = 029; idx[076] = 028; + idx[075] = 023; idx[074] = 022; idx[073] = 021; idx[072] = 020; + idx[071] = 015; idx[070] = 014; idx[069] = 013; idx[068] = 012; + idx[067] = 007; idx[066] = 006; idx[065] = 005; idx[064] = 004; + idx[063] = 123; idx[062] = 122; idx[061] = 121; idx[060] = 120; + idx[059] = 115; 
idx[058] = 114; idx[057] = 113; idx[056] = 112; + idx[055] = 107; idx[054] = 106; idx[053] = 105; idx[052] = 104; + idx[051] = 099; idx[050] = 098; idx[049] = 097; idx[048] = 096; + idx[047] = 091; idx[046] = 090; idx[045] = 089; idx[044] = 088; + idx[043] = 083; idx[042] = 082; idx[041] = 081; idx[040] = 080; + idx[039] = 075; idx[038] = 074; idx[037] = 073; idx[036] = 072; + idx[035] = 067; idx[034] = 066; idx[033] = 065; idx[032] = 064; + idx[031] = 059; idx[030] = 058; idx[029] = 057; idx[028] = 056; + idx[027] = 051; idx[026] = 050; idx[025] = 049; idx[024] = 048; + idx[023] = 043; idx[022] = 042; idx[021] = 041; idx[020] = 040; + idx[019] = 035; idx[018] = 034; idx[017] = 033; idx[016] = 032; + idx[015] = 027; idx[014] = 026; idx[013] = 025; idx[012] = 024; + idx[011] = 019; idx[010] = 018; idx[009] = 017; idx[008] = 016; + idx[007] = 011; idx[006] = 010; idx[005] = 009; idx[004] = 008; + idx[003] = 003; idx[002] = 002; idx[001] = 001; idx[000] = 000; + return idx[byte_idx[6:0]]; + end + rvv_pkg::EW16: begin + automatic vlen_t [127:0] idx; + idx[127] = 127; idx[126] = 126; idx[125] = 119; idx[124] = 118; + idx[123] = 111; idx[122] = 110; idx[121] = 103; idx[120] = 102; + idx[119] = 095; idx[118] = 094; idx[117] = 087; idx[116] = 086; + idx[115] = 079; idx[114] = 078; idx[113] = 071; idx[112] = 070; + idx[111] = 063; idx[110] = 062; idx[109] = 055; idx[108] = 054; + idx[107] = 047; idx[106] = 046; idx[105] = 039; idx[104] = 038; + idx[103] = 031; idx[102] = 030; idx[101] = 023; idx[100] = 022; + idx[099] = 015; idx[098] = 014; idx[097] = 007; idx[096] = 006; + idx[095] = 123; idx[094] = 122; idx[093] = 115; idx[092] = 114; + idx[091] = 107; idx[090] = 106; idx[089] = 099; idx[088] = 098; + idx[087] = 091; idx[086] = 090; idx[085] = 083; idx[084] = 082; + idx[083] = 075; idx[082] = 074; idx[081] = 067; idx[080] = 066; + idx[079] = 059; idx[078] = 058; idx[077] = 051; idx[076] = 050; + idx[075] = 043; idx[074] = 042; idx[073] = 035; idx[072] = 034; + idx[071] = 
027; idx[070] = 026; idx[069] = 019; idx[068] = 018; + idx[067] = 011; idx[066] = 010; idx[065] = 003; idx[064] = 002; + idx[063] = 125; idx[062] = 124; idx[061] = 117; idx[060] = 116; + idx[059] = 109; idx[058] = 108; idx[057] = 101; idx[056] = 100; + idx[055] = 093; idx[054] = 092; idx[053] = 085; idx[052] = 084; + idx[051] = 077; idx[050] = 076; idx[049] = 069; idx[048] = 068; + idx[047] = 061; idx[046] = 060; idx[045] = 053; idx[044] = 052; + idx[043] = 045; idx[042] = 044; idx[041] = 037; idx[040] = 036; + idx[039] = 029; idx[038] = 028; idx[037] = 021; idx[036] = 020; + idx[035] = 013; idx[034] = 012; idx[033] = 005; idx[032] = 004; + idx[031] = 121; idx[030] = 120; idx[029] = 113; idx[028] = 112; + idx[027] = 105; idx[026] = 104; idx[025] = 097; idx[024] = 096; + idx[023] = 089; idx[022] = 088; idx[021] = 081; idx[020] = 080; + idx[019] = 073; idx[018] = 072; idx[017] = 065; idx[016] = 064; + idx[015] = 057; idx[014] = 056; idx[013] = 049; idx[012] = 048; + idx[011] = 041; idx[010] = 040; idx[009] = 033; idx[008] = 032; + idx[007] = 025; idx[006] = 024; idx[005] = 017; idx[004] = 016; + idx[003] = 009; idx[002] = 008; idx[001] = 001; idx[000] = 000; + return idx[byte_idx[6:0]]; + end + rvv_pkg::EW8: begin + automatic vlen_t [127:0] idx; + idx[127] = 127; idx[126] = 119; idx[125] = 111; idx[124] = 103; + idx[123] = 095; idx[122] = 087; idx[121] = 079; idx[120] = 071; + idx[119] = 063; idx[118] = 055; idx[117] = 047; idx[116] = 039; + idx[115] = 031; idx[114] = 023; idx[113] = 015; idx[112] = 007; + idx[111] = 123; idx[110] = 115; idx[109] = 107; idx[108] = 099; + idx[107] = 091; idx[106] = 083; idx[105] = 075; idx[104] = 067; + idx[103] = 059; idx[102] = 051; idx[101] = 043; idx[100] = 035; + idx[099] = 027; idx[098] = 019; idx[097] = 011; idx[096] = 003; + idx[095] = 125; idx[094] = 117; idx[093] = 109; idx[092] = 101; + idx[091] = 093; idx[090] = 085; idx[089] = 077; idx[088] = 069; + idx[087] = 061; idx[086] = 053; idx[085] = 045; idx[084] = 037; + 
idx[083] = 029; idx[082] = 021; idx[081] = 013; idx[080] = 005; + idx[079] = 121; idx[078] = 113; idx[077] = 105; idx[076] = 097; + idx[075] = 089; idx[074] = 081; idx[073] = 073; idx[072] = 065; + idx[071] = 057; idx[070] = 049; idx[069] = 041; idx[068] = 033; + idx[067] = 025; idx[066] = 017; idx[065] = 009; idx[064] = 001; + idx[063] = 126; idx[062] = 118; idx[061] = 110; idx[060] = 102; + idx[059] = 094; idx[058] = 086; idx[057] = 078; idx[056] = 070; + idx[055] = 062; idx[054] = 054; idx[053] = 046; idx[052] = 038; + idx[051] = 030; idx[050] = 022; idx[049] = 014; idx[048] = 006; + idx[047] = 122; idx[046] = 114; idx[045] = 106; idx[044] = 098; + idx[043] = 090; idx[042] = 082; idx[041] = 074; idx[040] = 066; + idx[039] = 058; idx[038] = 050; idx[037] = 042; idx[036] = 034; + idx[035] = 026; idx[034] = 018; idx[033] = 010; idx[032] = 002; + idx[031] = 124; idx[030] = 116; idx[029] = 108; idx[028] = 100; + idx[027] = 092; idx[026] = 084; idx[025] = 076; idx[024] = 068; + idx[023] = 060; idx[022] = 052; idx[021] = 044; idx[020] = 036; + idx[019] = 028; idx[018] = 020; idx[017] = 012; idx[016] = 004; + idx[015] = 120; idx[014] = 112; idx[013] = 104; idx[012] = 096; + idx[011] = 088; idx[010] = 080; idx[009] = 072; idx[008] = 064; + idx[007] = 056; idx[006] = 048; idx[005] = 040; idx[004] = 032; + idx[003] = 024; idx[002] = 016; idx[001] = 008; idx[000] = 000; + return idx[byte_idx[6:0]]; + end + endcase endcase /*automatic vlen_t [8*MaxNrLanes-1:0] element_shuffle_index; From a9779cdd4d0a335f80098974853aa21300021230 Mon Sep 17 00:00:00 2001 From: Matheus Cavalcante Date: Thu, 8 Jul 2021 13:28:43 +0200 Subject: [PATCH 02/20] [apps] Avoid changing link.ld when changing Ara's configuration --- apps/.gitignore | 1 + apps/Makefile | 1 + apps/common/{link.ld => arch.link.ld} | 21 ++++++++++++--------- apps/common/script/align_sections.sh | 2 +- 4 files changed, 15 insertions(+), 10 deletions(-) rename apps/common/{link.ld => arch.link.ld} (63%) diff --git 
a/apps/.gitignore b/apps/.gitignore index ba077a403..ef412f9ba 100644 --- a/apps/.gitignore +++ b/apps/.gitignore @@ -1 +1,2 @@ bin +common/link.ld diff --git a/apps/Makefile b/apps/Makefile index 105da3d98..487053088 100644 --- a/apps/Makefile +++ b/apps/Makefile @@ -41,6 +41,7 @@ all: $(BINARIES) .PHONY: linker_script linker_script: $(COMMON_DIR)/script/align_sections.sh $(ROOT_DIR)/../../config/config.mk chmod +x $(COMMON_DIR)/script/align_sections.sh + rm -f $(COMMON_DIR)/link.ld && cp $(COMMON_DIR)/arch.link.ld $(COMMON_DIR)/link.ld $(COMMON_DIR)/script/align_sections.sh $(nr_lanes) $(COMMON_DIR)/link.ld # Make all applications diff --git a/apps/common/link.ld b/apps/common/arch.link.ld similarity index 63% rename from apps/common/link.ld rename to apps/common/arch.link.ld index 238238fba..efc1c8571 100644 --- a/apps/common/link.ld +++ b/apps/common/arch.link.ld @@ -1,3 +1,6 @@ +/* This file is used to generate link.ld, Ara's linker script, + which depends on the number of lanes of the current configuration */ + OUTPUT_ARCH("riscv") ENTRY(_start) @@ -15,38 +18,38 @@ SECTIONS { *(.text.*) } > L2 - .data : ALIGN(16) { + .data : ALIGN(ALIGNMENT) { *(.data) *(.data.*) } > L2 - .rodata : ALIGN(16) { *(.rodata .rodata.* .gnu.linkonce.r.*) } > L2 - .rodata1 : ALIGN(16) { *(.rodata1) } > L2 - .sdata2 : ALIGN(16) { + .rodata : ALIGN(ALIGNMENT) { *(.rodata .rodata.* .gnu.linkonce.r.*) } > L2 + .rodata1 : ALIGN(ALIGNMENT) { *(.rodata1) } > L2 + .sdata2 : ALIGN(ALIGNMENT) { *(.sdata2 .sdata2.* .gnu.linkonce.s2.*) } > L2 - .sdata : ALIGN(16) { + .sdata : ALIGN(ALIGNMENT) { __global_pointer$ = . + 0x800; *(.srodata.cst16) *(.srodata.cst8) *(.srodata.cst4) *(.srodata.cst2) *(.srodata .srodata.*) *(.sdata .sdata.* .gnu.linkonce.s.*) } > L2 - .bss : ALIGN(16) { + .bss : ALIGN(ALIGNMENT) { __bss_start = .; *(.bss) *(.sbss .sbss.* .sbss2 .sbss2.* .gnu.linkonce.sb2.*); __bss_end = .; } > L2 - .l2 : ALIGN(16) { + .l2 : ALIGN(ALIGNMENT) { *(.l2) timer = .; . = . 
+ 0x8; - l2_alloc_base = ALIGN(16); + l2_alloc_base = ALIGN(ALIGNMENT); } > L2 - .comment : ALIGN(16) { *(.comment) } > L2 + .comment : ALIGN(ALIGNMENT) { *(.comment) } > L2 eoc_address_reg = 0xD0000000; dram_start_address_reg = 0xD0000008; diff --git a/apps/common/script/align_sections.sh b/apps/common/script/align_sections.sh index 876ad5a44..0488f7316 100755 --- a/apps/common/script/align_sections.sh +++ b/apps/common/script/align_sections.sh @@ -4,4 +4,4 @@ # Align the sections by AxiWideBeWidth # NB: this script modify ALL the ALIGN directives let ALIGNMENT=4*$1; -sed -i "s/ALIGN([0-9]*)/ALIGN($ALIGNMENT)/g" $2 +sed -i "s/ALIGNMENT/$ALIGNMENT/g" $2 From fdcc86879d0b0364d3c17684ee62dc9b2b9a0ba0 Mon Sep 17 00:00:00 2001 From: Matheus Cavalcante Date: Thu, 8 Jul 2021 13:43:40 +0200 Subject: [PATCH 03/20] [config] Add more Ara configurations --- README.md | 4 +++- apps/Makefile | 2 +- apps/common/runtime.mk | 12 +++++++++++- config/16_lanes.mk | 25 +++++++++++++++++++++++++ config/2_lanes.mk | 25 +++++++++++++++++++++++++ config/{config.mk => 4_lanes.mk} | 0 config/8_lanes.mk | 25 +++++++++++++++++++++++++ config/README.md | 23 +++++++++++++++++++---- config/default.mk | 1 + hardware/Makefile | 30 ++++++++++++++++++++---------- 10 files changed, 130 insertions(+), 17 deletions(-) create mode 100644 config/16_lanes.mk create mode 100644 config/2_lanes.mk rename config/{config.mk => 4_lanes.mk} (100%) create mode 100644 config/8_lanes.mk create mode 120000 config/default.mk diff --git a/README.md b/README.md index b041d3855..547b79866 100644 --- a/README.md +++ b/README.md @@ -60,9 +60,11 @@ make verilator ## Configuration -Ara's parameters are centralized in the `config` folder, in the `config.mk` file. +Ara's parameters are centralized in the `config` folder, which provides several configurations to the vector machine. Please check `config/README.md` for more details. 
+Prepend `config=chosen_ara_configuration` to your Makefile commands, or export the `ARA_CONFIGURATION` variable, to choose a configuration other than the `default` one. + ## Software ### Build Applications diff --git a/apps/Makefile b/apps/Makefile index 487053088..125c9ef16 100644 --- a/apps/Makefile +++ b/apps/Makefile @@ -39,7 +39,7 @@ all: $(BINARIES) # Pre-process the linker-script to correclty align the sections .PHONY: linker_script -linker_script: $(COMMON_DIR)/script/align_sections.sh $(ROOT_DIR)/../../config/config.mk +linker_script: $(COMMON_DIR)/script/align_sections.sh $(ROOT_DIR)/../../config/$(config).mk chmod +x $(COMMON_DIR)/script/align_sections.sh rm -f $(COMMON_DIR)/link.ld && cp $(COMMON_DIR)/arch.link.ld $(COMMON_DIR)/link.ld $(COMMON_DIR)/script/align_sections.sh $(nr_lanes) $(COMMON_DIR)/link.ld diff --git a/apps/common/runtime.mk b/apps/common/runtime.mk index 39f0082f5..e331059ba 100644 --- a/apps/common/runtime.mk +++ b/apps/common/runtime.mk @@ -20,8 +20,18 @@ SHELL = /usr/bin/env bash ROOT_DIR := $(patsubst %/,%, $(dir $(abspath $(lastword $(MAKEFILE_LIST))))) ARA_DIR := $(shell git rev-parse --show-toplevel 2>/dev/null || echo $$ARA_DIR) + +# Choose Ara's configuration +ifndef config + ifdef ARA_CONFIGURATION + config := $(ARA_CONFIGURATION) + else + config := default + endif +endif + # Include configuration -include $(ARA_DIR)/config/config.mk +include $(ARA_DIR)/config/$(config).mk INSTALL_DIR ?= $(ARA_DIR)/install GCC_INSTALL_DIR ?= $(INSTALL_DIR)/riscv-gcc diff --git a/config/16_lanes.mk b/config/16_lanes.mk new file mode 100644 index 000000000..e9ebcc9e4 --- /dev/null +++ b/config/16_lanes.mk @@ -0,0 +1,25 @@ +# Copyright 2020 ETH Zurich and University of Bologna. +# +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Author: Samuel Riedel, ETH Zurich +# Matheus Cavalcante, ETH Zurich + +# Number of vector lanes +nr_lanes ?= 16 + +# Length of each vector register (in bits) +# Constraints: VLEN > 128 +vlen ?= 4096 diff --git a/config/2_lanes.mk b/config/2_lanes.mk new file mode 100644 index 000000000..013fb7f23 --- /dev/null +++ b/config/2_lanes.mk @@ -0,0 +1,25 @@ +# Copyright 2020 ETH Zurich and University of Bologna. +# +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Author: Samuel Riedel, ETH Zurich +# Matheus Cavalcante, ETH Zurich + +# Number of vector lanes +nr_lanes ?= 2 + +# Length of each vector register (in bits) +# Constraints: VLEN > 128 +vlen ?= 4096 diff --git a/config/config.mk b/config/4_lanes.mk similarity index 100% rename from config/config.mk rename to config/4_lanes.mk diff --git a/config/8_lanes.mk b/config/8_lanes.mk new file mode 100644 index 000000000..53792ec69 --- /dev/null +++ b/config/8_lanes.mk @@ -0,0 +1,25 @@ +# Copyright 2020 ETH Zurich and University of Bologna. 
+# +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Author: Samuel Riedel, ETH Zurich +# Matheus Cavalcante, ETH Zurich + +# Number of vector lanes +nr_lanes ?= 8 + +# Length of each vector register (in bits) +# Constraints: VLEN > 128 +vlen ?= 4096 diff --git a/config/README.md b/config/README.md index 94e5edb5e..18e2e9b3b 100644 --- a/config/README.md +++ b/config/README.md @@ -2,20 +2,35 @@ This file is included by all *Makefiles* in the Ara project to have a common source for all configurations. Please only edit this file to change some -parameters such as the number of cores in the design. This will automatically +parameters such as the number of lanes in the design. This will automatically generate the correct software runtime and the correct hardware. +Ara currently has four configurations, which differ in the number of lanes: +- `2_lanes.mk` +- `4_lanes.mk` +- `8_lanes.mk` +- `16_lanes.mk` +We also provide a `default.mk` configuration, which links to the `4_lanes` one. + +When running Ara's Makefiles, prepend `config=configuration_without_mk` to choose +a configuration. Alternatively, export the `ARA_CONFIGURATION` variable. Please note that +the configuration chosen via the `config=` command line has priority over the +configuration set globally through the `ARA_CONFIGURATION` variable. + +If no configuration is explicitly chosen, Ara will use the `default` one. Please run +`make clean` after changing configurations. 
+ To avoid constantly having a dirty git environment when working with a configuration that differs from the default one, you can ignore changes to the configuration file with the following command: ```bash -git update-index --assume-unchanged config/config.mk +git update-index --assume-unchanged config/default.mk ``` -In case you want to change the default and commit your changes to `config.mk`, +In case you want to change the default and commit your changes to `default.mk`, you can use the following command to make git pick up tracking the file again: ```bash -git update-index --no-assume-unchanged config/config.mk +git update-index --no-assume-unchanged config/default.mk ``` diff --git a/config/default.mk b/config/default.mk new file mode 120000 index 000000000..284cbccb3 --- /dev/null +++ b/config/default.mk @@ -0,0 +1 @@ +4_lanes.mk \ No newline at end of file diff --git a/hardware/Makefile b/hardware/Makefile index c13145176..fae523d06 100644 --- a/hardware/Makefile +++ b/hardware/Makefile @@ -10,8 +10,18 @@ ARA_DIR := $(shell git rev-parse --show-toplevel 2>/dev/null || echo $$ARA_DIR) INSTALL_DIR := $(abspath $(ROOT_DIR)/../install) VERILATOR_INCLUDE := $(INSTALL_DIR)/verilator/share/verilator/include/vltstd +# Choose Ara's configuration +ifndef config + ifdef ARA_CONFIGURATION + config := $(ARA_CONFIGURATION) + else + config := default + endif +endif + # Include configuration -include $(abspath $(ROOT_DIR)/../config/config.mk) +config_file := $(ROOT_DIR)/../config/$(config).mk +include $(abspath $(ROOT_DIR)/../config/$(config).mk) # build path buildpath ?= build @@ -89,13 +99,13 @@ $(buildpath)/$(library): # Compilation .PHONY: compile -compile: dpi lib $(buildpath) bender $(buildpath)/compile.tcl -$(buildpath)/compile.tcl: Makefile ../Bender.yml $(shell find src -type f) $(shell find ../config -type f) $(shell find include -type f) $(shell find tb -type f) $(shell find deps -type f) - ./bender script vsim --vlog-arg="$(vlog_args)" -t rtl -t asic -t 
ara_test -t cva6_test $(bender_defs) > $(buildpath)/compile.tcl - echo "exit" >> $(buildpath)/compile.tcl - cd $(buildpath) && $(questa_cmd) vsim -work $(library) -c -do compile.tcl +compile: dpi lib $(buildpath) bender $(buildpath)/compile_$(config).tcl +$(buildpath)/compile_$(config).tcl: $(config_file) Makefile ../Bender.yml $(shell find src -type f) $(shell find ../config -type f) $(shell find include -type f) $(shell find tb -type f) $(shell find deps -type f) + ./bender script vsim --vlog-arg="$(vlog_args)" -t rtl -t asic -t ara_test -t cva6_test $(bender_defs) > $(buildpath)/compile_$(config).tcl + echo "exit" >> $(buildpath)/compile_$(config).tcl + cd $(buildpath) && $(questa_cmd) vsim -work $(library) -c -do compile_$(config).tcl # Remove the file if compilation did not succeed - if [ `cat $(buildpath)/transcript | grep "\*\* Error" | wc -l` -ne 0 ]; then rm $(buildpath)/compile.tcl; fi + if [ `cat $(buildpath)/transcript | grep "\*\* Error" | wc -l` -ne 0 ]; then rm $(buildpath)/compile_$(config).tcl; fi # Simulation .PHONY: sim @@ -121,11 +131,11 @@ tests := $(ara_tests) $(cva6_tests) .PHONY: verilate verilate: $(buildpath) bender $(veril_library)/V$(veril_top) -$(veril_library)/V$(veril_top): Makefile ../Bender.yml $(shell find src -type f) $(shell find ../config -type f) $(shell find include -type f) $(shell find tb -type f) $(shell find deps -type f) +$(veril_library)/V$(veril_top): $(config_file) Makefile ../Bender.yml $(shell find src -type f) $(shell find ../config -type f) $(shell find include -type f) $(shell find tb -type f) $(shell find deps -type f) rm -rf $(veril_library); mkdir -p $(veril_library) - ./bender script verilator -t rtl -t ara_test -t cva6_test -t verilator $(bender_defs) > $(veril_library)/bender_script + ./bender script verilator -t rtl -t ara_test -t cva6_test -t verilator $(bender_defs) > $(veril_library)/bender_script_$(config) # Verilate the design - $(veril_path)/verilator -f $(veril_library)/bender_script \ + 
$(veril_path)/verilator -f $(veril_library)/bender_script_$(config) \ -GNrLanes=$(nr_lanes) \ -O3 \ -Wno-BLKANDNBLK \ From 93f6fb48e0297adaaa2d7ab19e9e4020fb3467c7 Mon Sep 17 00:00:00 2001 From: Matheus Cavalcante Date: Thu, 8 Jul 2021 14:24:07 +0200 Subject: [PATCH 04/20] [ci] Test all Ara configurations on the CI --- .github/workflows/ci.yml | 86 +++++++++++++++++++++++++++++----------- 1 file changed, 62 insertions(+), 24 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index c6ec6ea97..76dae31e5 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -169,6 +169,9 @@ jobs: compile-apps: runs-on: ubuntu-latest + strategy: + matrix: + ara_config: [2_lanes, 4_lanes, 8_lanes, 16_lanes] needs: tc-llvm steps: - uses: actions/checkout@v2 @@ -183,15 +186,18 @@ jobs: - name: Untar LLVM run: tar xvf tc-llvm.tar - name: Compile applications - run: make -C apps + run: config=${{ matrix.ara_config }} make -C apps - name: Upload applications uses: actions/upload-artifact@v2 with: - name: compile-apps + name: compile-apps-${{ matrix.ara_config }} path: apps/bin compile-riscv-tests: runs-on: ubuntu-latest + strategy: + matrix: + ara_config: [2_lanes, 4_lanes, 8_lanes, 16_lanes] needs: ["tc-llvm", "tc-gcc", "tc-isa-sim"] steps: - uses: actions/checkout@v2 @@ -218,15 +224,18 @@ jobs: - name: Untar GCC run: tar xvf tc-gcc.tar - name: Compile applications - run: make -C apps riscv_tests + run: config=${{ matrix.ara_config }} make -C apps riscv_tests - name: Upload applications uses: actions/upload-artifact@v2 with: - name: compile-riscv-tests + name: compile-riscv-tests-${{ matrix.ara_config }} path: apps/bin compile-ara: runs-on: ubuntu-latest + strategy: + matrix: + ara_config: [2_lanes, 4_lanes, 8_lanes, 16_lanes] needs: ["tc-verilator", "tc-isa-sim"] steps: - uses: actions/checkout@v2 @@ -262,13 +271,13 @@ jobs: run: | sudo apt-get install libelf-dev make -C hardware apply-patches - make -C hardware verilate + config=${{ 
matrix.ara_config }} make -C hardware verilate - name: Tar Verilated model of Ara run: tar -cvf ara.tar hardware/build/verilator hardware/bender - name: Upload Ara Verilated model uses: actions/upload-artifact@v2 with: - name: compile-ara + name: compile-ara-${{ matrix.ara_config }} path: ara.tar #################### @@ -277,6 +286,9 @@ jobs: simulate-hello-world: runs-on: ubuntu-latest + strategy: + matrix: + ara_config: [2_lanes, 4_lanes, 8_lanes, 16_lanes] needs: ["compile-ara", "compile-apps"] steps: - uses: actions/checkout@v2 @@ -289,19 +301,22 @@ jobs: - name: Get Verilated model of Ara uses: actions/download-artifact@v2 with: - name: compile-ara + name: compile-ara-${{ matrix.ara_config }} - name: Untar Verilated model of Ara run: tar xvf ara.tar - name: Get applications uses: actions/download-artifact@v2 with: - name: compile-apps + name: compile-apps-${{ matrix.ara_config }} path: apps/bin - name: Run test - run: make -C hardware app=hello_world simv + run: config=${{ matrix.ara_config }} make -C hardware app=hello_world simv simulate-imatmul: runs-on: ubuntu-latest + strategy: + matrix: + ara_config: [2_lanes, 4_lanes, 8_lanes, 16_lanes] needs: ["compile-ara", "compile-apps"] steps: - uses: actions/checkout@v2 @@ -314,19 +329,22 @@ jobs: - name: Get Verilated model of Ara uses: actions/download-artifact@v2 with: - name: compile-ara + name: compile-ara-${{ matrix.ara_config }} - name: Untar Verilated model of Ara run: tar xvf ara.tar - name: Get applications uses: actions/download-artifact@v2 with: - name: compile-apps + name: compile-apps-${{ matrix.ara_config }} path: apps/bin - name: Run test - run: make -C hardware app=imatmul simv + run: config=${{ matrix.ara_config }} make -C hardware app=imatmul simv simulate-fmatmul: runs-on: ubuntu-latest + strategy: + matrix: + ara_config: [2_lanes, 4_lanes, 8_lanes, 16_lanes] needs: ["compile-ara", "compile-apps"] steps: - uses: actions/checkout@v2 @@ -339,19 +357,22 @@ jobs: - name: Get Verilated model of Ara 
uses: actions/download-artifact@v2 with: - name: compile-ara + name: compile-ara-${{ matrix.ara_config }} - name: Untar Verilated model of Ara run: tar xvf ara.tar - name: Get applications uses: actions/download-artifact@v2 with: - name: compile-apps + name: compile-apps-${{ matrix.ara_config }} path: apps/bin - name: Run test - run: make -C hardware app=fmatmul simv + run: config=${{ matrix.ara_config }} make -C hardware app=fmatmul simv simulate-conv2d: runs-on: ubuntu-latest + strategy: + matrix: + ara_config: [2_lanes, 4_lanes, 8_lanes, 16_lanes] needs: ["compile-ara", "compile-apps"] steps: - uses: actions/checkout@v2 @@ -364,16 +385,16 @@ jobs: - name: Get Verilated model of Ara uses: actions/download-artifact@v2 with: - name: compile-ara + name: compile-ara-${{ matrix.ara_config }} - name: Untar Verilated model of Ara run: tar xvf ara.tar - name: Get applications uses: actions/download-artifact@v2 with: - name: compile-apps + name: compile-apps-${{ matrix.ara_config }} path: apps/bin - name: Run test - run: make -C hardware app=conv2d simv + run: config=${{ matrix.ara_config }} make -C hardware app=conv2d simv ######################## # RISC-V Tests stage # @@ -381,6 +402,9 @@ jobs: riscv-tests-simv: runs-on: ubuntu-latest + strategy: + matrix: + ara_config: [2_lanes, 4_lanes, 8_lanes, 16_lanes] needs: ["compile-ara", "compile-riscv-tests"] steps: - uses: actions/checkout@v2 @@ -393,16 +417,16 @@ jobs: - name: Get Verilated model of Ara uses: actions/download-artifact@v2 with: - name: compile-ara + name: compile-ara-${{ matrix.ara_config }} - name: Untar Verilated model of Ara run: tar xvf ara.tar - name: Get RISC-V tests uses: actions/download-artifact@v2 with: - name: compile-riscv-tests + name: compile-riscv-tests-${{ matrix.ara_config }} path: apps/bin - name: Run tests - run: make -C hardware -j4 riscv_tests_simv + run: config=${{ matrix.ara_config }} make -C hardware -j4 riscv_tests_simv riscv-tests-spike: runs-on: ubuntu-latest @@ -518,7 +542,21 @@ 
jobs: tc-gcc tc-isa-sim tc-verilator - compile-ara - compile-apps - compile-riscv-tests riscv-tests-spike + + clean-up-compile-runs: + runs-on: ubuntu-latest + strategy: + matrix: + ara_config: [2_lanes, 4_lanes, 8_lanes, 16_lanes] + if: always() + needs: ["simulate-hello-world", "simulate-imatmul", "simulate-fmatmul", "simulate-conv2d", "riscv-tests-spike", "riscv-tests-simv"] + steps: + - uses: actions/checkout@v2 + - name: Delete artifacts + uses: geekyeggo/delete-artifact@v1 + with: + name: | + compile-ara-${{ matrix.ara_config }} + compile-apps-${{ matrix.ara_config }} + compile-riscv-tests-${{ matrix.ara_config }} From f43d7c8609357b04e077e4be50a251befddc631d Mon Sep 17 00:00:00 2001 From: Matheus Cavalcante Date: Thu, 8 Jul 2021 18:32:41 +0200 Subject: [PATCH 05/20] [apps] Align conv2d data on 16-byte lines --- apps/conv2d/data.S | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/apps/conv2d/data.S b/apps/conv2d/data.S index 250c90251..824e3001a 100644 --- a/apps/conv2d/data.S +++ b/apps/conv2d/data.S @@ -15,7 +15,7 @@ F: .word 0x00000003 .word 0x00000000 .global i -.align NR_LANES*4 +.align 16 i: .word 0x0000229a .word 0x00000000 @@ -8730,7 +8730,7 @@ i: .word 0x000007bb .word 0x00000000 .global f -.align NR_LANES*4 +.align 16 f: .word 0x00000000 .word 0x00000000 @@ -8751,7 +8751,7 @@ f: .word 0x00000009 .word 0x00000000 .global o -.align NR_LANES*4 +.align 16 o: .word 0x00000000 .word 0x00000000 @@ -16946,7 +16946,7 @@ o: .word 0x00000000 .word 0x00000000 .global golden_o -.align NR_LANES*4 +.align 16 golden_o: .word 0x000bf2d0 .word 0x00000000 From bd4b1aa131b10aa7ac328981beec398d2d6fd810 Mon Sep 17 00:00:00 2001 From: Matheus Cavalcante Date: Thu, 8 Jul 2021 18:48:52 +0200 Subject: [PATCH 06/20] [hardware] :bug: Calculate `vstu`'s vl based on the current `SEW` --- hardware/src/vlsu/vstu.sv | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hardware/src/vlsu/vstu.sv b/hardware/src/vlsu/vstu.sv index 
8dd839c49..03f594244 100644 --- a/hardware/src/vlsu/vstu.sv +++ b/hardware/src/vlsu/vstu.sv @@ -302,7 +302,7 @@ module vstu import ara_pkg::*; import rvv_pkg::*; #( // Initialize counters if (vinsn_queue_d.issue_cnt == '0) - issue_cnt_d = pe_req_i.vl << int'(pe_req_i.eew_vs1); + issue_cnt_d = pe_req_i.vl << int'(pe_req_i.vtype.vsew); // Bump pointers and counters of the vector instruction queue vinsn_queue_d.accept_pnt += 1; From 4e5a8bba0d3544ed4efac0d89fdcb959f027a1bb Mon Sep 17 00:00:00 2001 From: Matheus Cavalcante Date: Fri, 9 Jul 2021 11:09:36 +0200 Subject: [PATCH 07/20] [hardware] :bug: Fix `vslideup` vector length trimming --- hardware/src/lane/lane_sequencer.sv | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/hardware/src/lane/lane_sequencer.sv b/hardware/src/lane/lane_sequencer.sv index 02bf5718f..f3bb4f302 100644 --- a/hardware/src/lane/lane_sequencer.sv +++ b/hardware/src/lane/lane_sequencer.sv @@ -377,14 +377,10 @@ module lane_sequencer import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg:: VSLIDEUP: begin // We need to trim full words from the end of the vector that are not used // as operands by the slide unit. - automatic vlen_t vslideup_adj = pe_req_i.stride >> ($clog2(NrLanes) + int'(EW64) - int'(pe_req_i.eew_vs2)); - // Since this request goes outside of the lane, we might need to request an // extra operand regardless of whether it is valid in this lane or not. 
- operand_request_i[SlideAddrGenA].vl = pe_req_i.vl / NrLanes - vslideup_adj; - - if ((operand_request_i[SlideAddrGenA].vl + vslideup_adj) * NrLanes != pe_req_i.vl) - operand_request_i[SlideAddrGenA].vl += 1; + operand_request_i[SlideAddrGenA].vl = + (pe_req_i.vl - pe_req_i.stride + NrLanes - 1) / NrLanes; end VSLIDEDOWN: begin // We need to trim full words from the start of the vector that are not used @@ -421,13 +417,14 @@ module lane_sequencer import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg:: VSLIDEUP: begin // We need to trim full words from the end of the vector that are not used // as operands by the slide unit. - automatic vlen_t vslideup_adj = (pe_req_i.stride / NrLanes / 8) >> (int'(EW64) - int'(pe_req_i.vtype.vsew)); - // Since this request goes outside of the lane, we might need to request an // extra operand regardless of whether it is valid in this lane or not. - operand_request_i[MaskM].vl = ((pe_req_i.vl / NrLanes / 8) >> (int'(EW64) - int'(pe_req_i.vtype.vsew))) - vslideup_adj; + operand_request_i[MaskM].vl = + ((pe_req_i.vl - pe_req_i.stride + NrLanes - 1) / 8 / NrLanes) + >> (int'(EW64) - int'(pe_req_i.vtype.vsew)); - if (((operand_request_i[MaskM].vl + vslideup_adj) << (int'(EW64) - int'(pe_req_i.vtype.vsew))) * NrLanes * 8 != pe_req_i.vl) + if (((operand_request_i[MaskM].vl + pe_req_i.stride) << + (int'(EW64) - int'(pe_req_i.vtype.vsew)) * NrLanes * 8 != pe_req_i.vl)) operand_request_i[MaskM].vl += 1; end VSLIDEDOWN: begin From 6634e03e3d0455565cee20756a11c5779aa4ca05 Mon Sep 17 00:00:00 2001 From: Matheus Cavalcante Date: Fri, 9 Jul 2021 13:50:53 +0200 Subject: [PATCH 08/20] [hardware] :bug: Fix `vslidedown` vector length trimming --- hardware/src/lane/lane_sequencer.sv | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/hardware/src/lane/lane_sequencer.sv b/hardware/src/lane/lane_sequencer.sv index f3bb4f302..622de2bfb 100644 --- a/hardware/src/lane/lane_sequencer.sv +++ b/hardware/src/lane/lane_sequencer.sv @@ 
-385,8 +385,7 @@ module lane_sequencer import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg:: VSLIDEDOWN: begin // We need to trim full words from the start of the vector that are not used // as operands by the slide unit. - automatic vlen_t vslidedown_adj = pe_req_i.stride >> ($clog2(NrLanes) + int'(EW64) - int'(pe_req_i.eew_vs2)); - operand_request_i[SlideAddrGenA].vstart = vslidedown_adj; + operand_request_i[SlideAddrGenA].vstart = pe_req_i.stride / NrLanes; // Since this request goes outside of the lane, we might need to request an // extra operand regardless of whether it is valid in this lane or not. @@ -394,9 +393,11 @@ module lane_sequencer import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg:: if (operand_request_i[SlideAddrGenA].vl * NrLanes != pe_req_i.vl) operand_request_i[SlideAddrGenA].vl += 1; - // If the vslidedown stride is not a full VRF word, we will need to request an extra word + // If the vslidedown stride is not a full VRF word, we will need to request an extra + // word if (!pe_req_i.use_scalar_op) - if (pe_req_i.stride - (vslidedown_adj << ($clog2(NrLanes) + int'(EW64) - int'(pe_req_i.eew_vs2))) != 0) + if ((pe_req_i.stride & + ((vlen_t'(1) << ($clog2(NrLanes) + int'(EW64 - pe_req_i.eew_vs2))) - 1)) != 0) operand_request_i[SlideAddrGenA].vl += 1; end endcase From e6edeb2ba53e2d045d3b642fb2fc4c12e1c46af4 Mon Sep 17 00:00:00 2001 From: Matheus Cavalcante Date: Tue, 13 Jul 2021 18:58:50 +0200 Subject: [PATCH 09/20] [hardware] :bug: Mute mask requests of idle lanes --- hardware/src/masku/masku.sv | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/hardware/src/masku/masku.sv b/hardware/src/masku/masku.sv index 1df4c8c34..fd5323cd6 100644 --- a/hardware/src/masku/masku.sv +++ b/hardware/src/masku/masku.sv @@ -461,14 +461,19 @@ module masku import ara_pkg::*; import rvv_pkg::*; #( else mask_queue_write_pnt_d = mask_queue_write_pnt_q + 1; - // Trigger the request signal - mask_queue_valid_d[mask_queue_write_pnt_q] 
= {NrLanes{1'b1}}; - // Account for the operands that were issued read_cnt_d = read_cnt_q - NrLanes * (1 << (int'(EW64) - vinsn_issue.vtype.vsew)); if (read_cnt_q < NrLanes * (1 << (int'(EW64) - vinsn_issue.vtype.vsew))) read_cnt_d = '0; + // Trigger the request signal + mask_queue_valid_d[mask_queue_write_pnt_q] = {NrLanes{1'b1}}; + + // Are there lanes with no valid elements? + // If so, mute their request signal + if (read_cnt_q < NrLanes) + mask_queue_valid_d[mask_queue_write_pnt_q] = (1 << read_cnt_q) - 1; + // Consumed all valid bytes from the lane operands if (mask_pnt_d == NrLanes*64 || read_cnt_d == '0) begin // Request another beat From 3e19cc23d3f918a538bdc13f5e65baa3e818a1de Mon Sep 17 00:00:00 2001 From: Matheus Cavalcante Date: Tue, 13 Jul 2021 21:26:42 +0200 Subject: [PATCH 10/20] [verilator] Update Verilator to version v4.210 --- .github/workflows/ci.yml | 9 --------- Makefile | 16 ++++++++++++++-- toolchain/verilator | 2 +- 3 files changed, 15 insertions(+), 12 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 76dae31e5..a0252d053 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -145,11 +145,6 @@ jobs: if: steps.tc-verilator-cache.outputs.cache-hit != 'true' run: | git submodule update --init --recursive -- toolchain/verilator - - name: Install LLVM and Clang - if: steps.tc-verilator-cache.outputs.cache-hit != 'true' - uses: KyleMayes/install-llvm-action@v1 - with: - version: "10.0" - name: Compile Verilator if: steps.tc-verilator-cache.outputs.cache-hit != 'true' run: | @@ -263,10 +258,6 @@ jobs: ln -s $VERILATOR_ROOT/share/verilator/bin/verilator_includer $VERILATOR_ROOT/bin/verilator_includer - name: Download RTL submodules run: git submodule update --init --recursive hardware - - name: Install LLVM and Clang - uses: KyleMayes/install-llvm-action@v1 - with: - version: "10.0" - name: Compile Verilated model of Ara run: | sudo apt-get install libelf-dev diff --git a/Makefile b/Makefile index 
d5d661371..a3bb7753d 100644 --- a/Makefile +++ b/Makefile @@ -24,7 +24,7 @@ GCC_INSTALL_DIR ?= ${INSTALL_DIR}/riscv-gcc LLVM_INSTALL_DIR ?= ${INSTALL_DIR}/riscv-llvm ISA_SIM_INSTALL_DIR ?= ${INSTALL_DIR}/riscv-isa-sim VERIL_INSTALL_DIR ?= ${INSTALL_DIR}/verilator -VERIL_VERSION ?= v4.106 +VERIL_VERSION ?= v4.210 CMAKE ?= cmake @@ -37,6 +37,17 @@ ifeq ($(origin CXX),default) CXX = g++ endif +# We need a recent LLVM to compile Verilator +CLANG_CC ?= clang +CLANG_CXX ?= clang++ +ifneq (${CLANG_PATH},) + CLANG_CXXFLAGS := "-nostdinc++ -isystem $(CLANG_PATH)/include/c++/v1" + CLANG_LDFLAGS := "-L $(CLANG_PATH)/lib -Wl,-rpath,$(CLANG_PATH)/lib -lc++ -nostdlib++" +else + CLANG_CXXFLAGS := "" + CLANG_LDFLAGS := "" +endif + # Default target all: toolchains riscv-isa-sim verilator @@ -131,7 +142,8 @@ ${VERIL_INSTALL_DIR}: Makefile cd $(CURDIR)/toolchain/verilator && git reset --hard && git fetch && git checkout ${VERIL_VERSION} # Compile verilator cd $(CURDIR)/toolchain/verilator && git clean -xfdf && autoconf && \ - CC=clang CXX=clang++ ./configure --prefix=$(VERIL_INSTALL_DIR) && make -j4 && make install + CC=$(CLANG_CC) CXX=$(CLANG_CXX) CXXFLAGS=$(CLANG_CXXFLAGS) LDFLAGS=$(CLANG_LDFLAGS) \ + ./configure --prefix=$(VERIL_INSTALL_DIR) && make -j8 && make install # RISC-V Tests riscv_tests: diff --git a/toolchain/verilator b/toolchain/verilator index b350b6a0f..8e2ba6a00 160000 --- a/toolchain/verilator +++ b/toolchain/verilator @@ -1 +1 @@ -Subproject commit b350b6a0ffdebe47c1681ec6a868daf47b7a1f9a +Subproject commit 8e2ba6a00382075387b32fbbf8f5f85fec482d9a From 97db4360b5abc5ab042c021743fd3656015541e1 Mon Sep 17 00:00:00 2001 From: Matheus Cavalcante Date: Tue, 13 Jul 2021 23:07:55 +0200 Subject: [PATCH 11/20] [hardware] Implement hierarchical verilation --- Bender.yml | 3 +++ hardware/Makefile | 21 ++++++++++++++++----- hardware/include/ara/ara.svh | 28 ++++++++++++++++++++++++++++ hardware/src/lane/lane.sv | 12 +++++++----- hardware/tb/verilator/waiver.vlt | 21 
+++++++++++++++++++++ 5 files changed, 75 insertions(+), 10 deletions(-) create mode 100644 hardware/include/ara/ara.svh create mode 100644 hardware/tb/verilator/waiver.vlt diff --git a/Bender.yml b/Bender.yml index 4a5b70cec..13e608811 100644 --- a/Bender.yml +++ b/Bender.yml @@ -16,6 +16,9 @@ dependencies: workspace: checkout_dir: "hardware/deps" +export_include_dirs: + - hardware/include + sources: files: # Headers diff --git a/hardware/Makefile b/hardware/Makefile index fae523d06..7a4c6661d 100644 --- a/hardware/Makefile +++ b/hardware/Makefile @@ -23,6 +23,15 @@ endif config_file := $(ROOT_DIR)/../config/$(config).mk include $(abspath $(ROOT_DIR)/../config/$(config).mk) +# Clang flags for Verilator command +ifneq (${CLANG_PATH},) + CLANG_CXXFLAGS := -CFLAGS "-nostdinc++ -isystem $(CLANG_PATH)/include/c++/v1" + CLANG_LDFLAGS := -LDFLAGS "-L $(CLANG_PATH)/lib -Wl,-rpath,$(CLANG_PATH)/lib -lc++ -nostdlib++" +else + CLANG_CXXFLAGS := "" + CLANG_LDFLAGS := "" +endif + # build path buildpath ?= build resultpath ?= results @@ -141,24 +150,26 @@ $(veril_library)/V$(veril_top): $(config_file) Makefile ../Bender.yml $(shell fi -Wno-BLKANDNBLK \ -Wno-CASEINCOMPLETE \ -Wno-CMPCONST \ + -Wno-LATCH \ -Wno-LITENDIAN \ - -Wno-MODDUP \ - -Wno-PINMISSING \ - -Wno-SYMRSVDWORD \ -Wno-UNOPTFLAT \ -Wno-UNPACKED \ -Wno-UNSIGNED \ -Wno-WIDTH \ -Wno-WIDTHCONCAT \ - --Mdir $(veril_library) --trace \ + --hierarchical \ + tb/verilator/waiver.vlt \ + --Mdir $(veril_library) \ -Itb/dpi \ --compiler clang \ - -CFLAGS "-std=c++11 -Wall -DTOPLEVEL_NAME=$(veril_top)" \ + -CFLAGS "-DTOPLEVEL_NAME=$(veril_top)" \ -CFLAGS "-DNR_LANES=$(nr_lanes)" \ -CFLAGS -I$(ROOT_DIR)/tb/verilator/lowrisc_dv_verilator_memutil_dpi/cpp \ -CFLAGS -I$(ROOT_DIR)/tb/verilator/lowrisc_dv_verilator_memutil_verilator/cpp \ -CFLAGS -I$(ROOT_DIR)/tb/verilator/lowrisc_dv_verilator_simutil_verilator/cpp \ + $(CLANG_CXXFLAGS) \ -LDFLAGS "-lelf" \ + $(CLANG_LDFLAGS) \ --exe \ 
$(ROOT_DIR)/tb/verilator/lowrisc_dv_verilator_memutil_dpi/cpp/*.cc \ $(ROOT_DIR)/tb/verilator/lowrisc_dv_verilator_memutil_verilator/cpp/*.cc \ diff --git a/hardware/include/ara/ara.svh b/hardware/include/ara/ara.svh new file mode 100644 index 000000000..612d72f07 --- /dev/null +++ b/hardware/include/ara/ara.svh @@ -0,0 +1,28 @@ +// Copyright 2021 ETH Zurich and University of Bologna. +// Solderpad Hardware License, Version 0.51, see LICENSE for details. +// SPDX-License-Identifier: SHL-0.51 + +// Macros used throughout the Ara project + +`ifndef ARA_SVH_ +`define ARA_SVH_ + + // Structs in ports of hierarchical modules are not supported in Verilator + // --> Flatten them for Verilator + `define STRUCT_PORT(struct_t) \ + `ifndef VERILATOR \ + struct_t \ + `else \ + logic[$bits(struct_t)-1:0] \ + `endif + + // Create a flattened vector of a struct. Make sure the first dimension is + // the dimension into the vector of struct types and not the struct itself. + `define STRUCT_VECT(struct_t, dim) \ + `ifndef VERILATOR \ + struct_t dim \ + `else \ + logic dim [$bits(struct_t)-1:0] \ + `endif + +`endif diff --git a/hardware/src/lane/lane.sv b/hardware/src/lane/lane.sv index 254191065..23c1b7d2a 100644 --- a/hardware/src/lane/lane.sv +++ b/hardware/src/lane/lane.sv @@ -7,6 +7,8 @@ // This is one of Ara's lanes. It contains part of the vector register file // together with the execution units. 
+`include "ara/ara.svh" + module lane import ara_pkg::*; import rvv_pkg::*; #( parameter int unsigned NrLanes = 1, // Number of lanes // Support for floating-point data types @@ -34,10 +36,10 @@ module lane import ara_pkg::*; import rvv_pkg::*; #( output logic [4:0] fflags_ex_o, output logic fflags_ex_valid_o, // Interface with the sequencer - input pe_req_t pe_req_i, + input `STRUCT_PORT(pe_req_t) pe_req_i, input logic pe_req_valid_i, output logic pe_req_ready_o, - output pe_resp_t pe_resp_o, + output `STRUCT_PORT(pe_resp_t) pe_resp_o, // Interface with the Store unit output elen_t stu_operand_o, output logic stu_operand_valid_o, @@ -62,9 +64,9 @@ module lane import ara_pkg::*; import rvv_pkg::*; #( input strb_t ldu_result_be_i, output logic ldu_result_gnt_o, // Interface with the Mask unit - output elen_t [2:0] mask_operand_o, - output logic [2:0] mask_operand_valid_o, - input logic [2:0] mask_operand_ready_i, + output `STRUCT_VECT(elen_t, [2:0]) mask_operand_o, + output logic [2:0] mask_operand_valid_o, + input logic [2:0] mask_operand_ready_i, input logic masku_result_req_i, input vid_t masku_result_id_i, input vaddr_t masku_result_addr_i, diff --git a/hardware/tb/verilator/waiver.vlt b/hardware/tb/verilator/waiver.vlt new file mode 100644 index 000000000..018a7d585 --- /dev/null +++ b/hardware/tb/verilator/waiver.vlt @@ -0,0 +1,21 @@ +// Copyright 2021 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. +// SPDX-License-Identifier: Apache-2.0 + +`verilator_config + +// Hierarchical verilation +hier_block -module "lane" + +// Hierarchical modules will be renamed by Verilator. 
Disable the DECLFILENAME +// check for those right away +lint_off -rule DECLFILENAME -file "*" -match "*lane*" + +// Ignore duplicate modules, since this is handled by Bender +lint_off -rule MODDUP + +// Ignore missing pins on Ariane +lint_off -rule PINMISSING -file "*/cva6/*" -match "*" + +// Ignore usage of reserved words on Ariane +lint_off -rule SYMRSVDWORD -file "*/cva6/*" -match "*" From 6618ac00a9ae0421ee552dd95f5f4ee9b9fe7612 Mon Sep 17 00:00:00 2001 From: Matheus Cavalcante Date: Wed, 14 Jul 2021 10:01:56 +0200 Subject: [PATCH 12/20] [hardware] :bug: Mute lane instructions with vector length zero --- hardware/src/ara_sequencer.sv | 2 +- hardware/src/lane/lane_sequencer.sv | 7 +++++++ hardware/src/lane/operand_requester.sv | 6 ++++++ 3 files changed, 14 insertions(+), 1 deletion(-) diff --git a/hardware/src/ara_sequencer.sv b/hardware/src/ara_sequencer.sv index 72aff15fe..da866b87a 100644 --- a/hardware/src/ara_sequencer.sv +++ b/hardware/src/ara_sequencer.sv @@ -241,7 +241,7 @@ module ara_sequencer import ara_pkg::*; import rvv_pkg::*; #( endcase // Masked vector instructions also run on the mask unit - pe_vinsn_running_d[NrLanes + OffsetMask][vinsn_id_n] = !ara_req_i.vm; + pe_vinsn_running_d[NrLanes + OffsetMask][vinsn_id_n] |= !ara_req_i.vm; // Some instructions need to wait for an acknowledgment // before being committed with Ariane diff --git a/hardware/src/lane/lane_sequencer.sv b/hardware/src/lane/lane_sequencer.sv index 622de2bfb..a8245f021 100644 --- a/hardware/src/lane/lane_sequencer.sv +++ b/hardware/src/lane/lane_sequencer.sv @@ -190,6 +190,13 @@ module lane_sequencer import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg:: // If lane_id_i < vl % NrLanes, this lane has to execute one extra micro-operation. if (lane_id_i < pe_req_i.vl[idx_width(NrLanes)-1:0]) vfu_operation_d.vl += 1; + // Mute request if the instruction runs in the lane and the vl is zero. 
+ if (vfu_operation_d.vl == '0 && (vfu_operation_d.vfu inside {VFU_Alu, VFU_MFpu})) begin + vfu_operation_valid_d = 1'b0; + // We are already done with this instruction + vinsn_done_d[pe_req_i.id] |= 1'b1; + end + // Vector start calculation vfu_operation_d.vstart = pe_req_i.vstart / NrLanes; // If lane_id_i < vstart % NrLanes, this lane needs to execute one micro-operation less. diff --git a/hardware/src/lane/operand_requester.sv b/hardware/src/lane/operand_requester.sv index 81474c2e9..078fb4c1c 100644 --- a/hardware/src/lane/operand_requester.sv +++ b/hardware/src/lane/operand_requester.sv @@ -283,6 +283,12 @@ module operand_requester import ara_pkg::*; import rvv_pkg::*; #( hazard : operand_request_i[requester].hazard, default: '0 }; + + // Mute the requisition if the vl is zero + if (operand_request_i[requester].vl == '0) begin + state_d = IDLE; + operand_queue_cmd_valid_o[requester] = 1'b0; + end end end From 3eb68622e9ca8ae76639cad3a7bf571d3d927e31 Mon Sep 17 00:00:00 2001 From: Matheus Cavalcante Date: Wed, 14 Jul 2021 11:40:33 +0200 Subject: [PATCH 13/20] [hardware] :bug: Fix `simd_div` index calculation --- hardware/src/lane/simd_div.sv | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/hardware/src/lane/simd_div.sv b/hardware/src/lane/simd_div.sv index 379a0b63e..5cc2d00eb 100644 --- a/hardware/src/lane/simd_div.sv +++ b/hardware/src/lane/simd_div.sv @@ -120,7 +120,7 @@ module simd_div import ara_pkg::*; import rvv_pkg::*; #( // The request was accepted: load how many elements to process/commit load_cnt = 1'b1; // Check if the next byte is valid or not. If not, skip it. - issue_state_d = (be_q[cnt_init_val]) ? ISSUE_VALID : ISSUE_SKIP; + issue_state_d = (be_q[cnt_init_val << vew_q]) ? 
ISSUE_VALID : ISSUE_SKIP; end ISSUE_VALID: begin // The inputs are valid @@ -134,7 +134,7 @@ module simd_div import ara_pkg::*; import rvv_pkg::*; #( issue_state_d = WAIT_DONE; // If we are not issuing the last operands, decide if to process or skip the next byte end else begin - issue_state_d = (be_q[issue_cnt_d]) ? ISSUE_VALID : ISSUE_SKIP; + issue_state_d = (be_q[issue_cnt_d << vew_q]) ? ISSUE_VALID : ISSUE_SKIP; end end end @@ -146,7 +146,7 @@ module simd_div import ara_pkg::*; import rvv_pkg::*; #( issue_state_d = WAIT_DONE; // If we are not issuing the last operands, decide if to process or skip the next byte end else begin - issue_state_d = (be_q[issue_cnt_d]) ? ISSUE_VALID : ISSUE_SKIP; + issue_state_d = (be_q[issue_cnt_d << vew_q]) ? ISSUE_VALID : ISSUE_SKIP; end end WAIT_DONE: begin @@ -171,7 +171,7 @@ module simd_div import ara_pkg::*; import rvv_pkg::*; #( COMMIT_IDLE: begin // Start if the issue CU has already started if (issue_state_q != ISSUE_IDLE) begin - commit_state_d = (be_q[cnt_init_val]) ? COMMIT_READY : COMMIT_SKIP; + commit_state_d = (be_q[cnt_init_val << vew_q]) ? COMMIT_READY : COMMIT_SKIP; end end COMMIT_READY: begin @@ -184,19 +184,18 @@ module simd_div import ara_pkg::*; import rvv_pkg::*; #( commit_state_d = COMMIT_DONE; // If we are not committing the last result, decide if to process or skip the next one end else begin - commit_state_d = (be_q[commit_cnt_d]) ? COMMIT_READY : COMMIT_SKIP; + commit_state_d = (be_q[commit_cnt_d << vew_q]) ? COMMIT_READY : COMMIT_SKIP; end end end COMMIT_SKIP: begin - serdiv_out_ready = 1'b1; commit_cnt_en = 1'b1; // If we are skipping the last result, complete the execution if (commit_cnt_q == '0) begin commit_state_d = COMMIT_DONE; // If we are not committing the last result, decide if to process or skip the next one end else begin - commit_state_d = (be_q[commit_cnt_d]) ? COMMIT_READY : COMMIT_SKIP; + commit_state_d = (be_q[commit_cnt_d << vew_q]) ? 
COMMIT_READY : COMMIT_SKIP; end end COMMIT_DONE: begin @@ -329,10 +328,10 @@ module simd_div import ara_pkg::*; import rvv_pkg::*; #( // Output buffer // Shift the partial result and update the output buffer with the new masked byte/halfword/word - // If we are skipping a byte, just shift + // If we are skipping an element, just shift always_comb begin if (commit_state_q == COMMIT_SKIP) begin - shifted_result = result_q << 8; + shifted_result = result_q << (8 << vew_q); serdiv_result_masked = '0; end else begin case (vew_q) From 36f07a7c09bef42047197c9490d525d679497cf6 Mon Sep 17 00:00:00 2001 From: Matheus Cavalcante Date: Wed, 14 Jul 2021 13:44:59 +0200 Subject: [PATCH 14/20] [axi] Update `axi` to v0.29.1 --- hardware/deps/axi | 2 +- hardware/tb/verilator/waiver.vlt | 3 +++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/hardware/deps/axi b/hardware/deps/axi index b7d2b0b62..442ff3375 160000 --- a/hardware/deps/axi +++ b/hardware/deps/axi @@ -1 +1 @@ -Subproject commit b7d2b0b629dc2bf44789f7e993f757bfc7961b3f +Subproject commit 442ff3375710513623f95944d66cc2bd09b2f155 diff --git a/hardware/tb/verilator/waiver.vlt b/hardware/tb/verilator/waiver.vlt index 018a7d585..e37a34cf1 100644 --- a/hardware/tb/verilator/waiver.vlt +++ b/hardware/tb/verilator/waiver.vlt @@ -17,5 +17,8 @@ lint_off -rule MODDUP // Ignore missing pins on Ariane lint_off -rule PINMISSING -file "*/cva6/*" -match "*" +// Ignore missing pins on the axi_cdc_src +lint_off -rule PINMISSING -file "*/axi/*" -match "*" + // Ignore usage of reserved words on Ariane lint_off -rule SYMRSVDWORD -file "*/cva6/*" -match "*" From 147aaf1a8ff7e0de577fd4fc3ee2b9750634c93b Mon Sep 17 00:00:00 2001 From: Matheus Cavalcante Date: Wed, 14 Jul 2021 13:46:26 +0200 Subject: [PATCH 15/20] [common_cells] Update `common_cells` to v1.22.1 --- Bender.lock | 11 +++++------ Bender.yml | 4 ++-- hardware/deps/common_cells | 2 +- 3 files changed, 8 insertions(+), 9 deletions(-) diff --git a/Bender.lock b/Bender.lock 
index 0cb929e0f..ae080d9ac 100644 --- a/Bender.lock +++ b/Bender.lock @@ -1,17 +1,16 @@ --- packages: axi: - revision: b7d2b0b629dc2bf44789f7e993f757bfc7961b3f - version: ~ + revision: 442ff3375710513623f95944d66cc2bd09b2f155 + version: 0.29.1 source: Git: "https://github.com/pulp-platform/axi.git" dependencies: - common_cells - common_verification - - tech_cells_generic common_cells: - revision: 6aeee85d0a34fedc06c14f04fd6363c9f7b4eeea - version: 1.21.0 + revision: 015917ff33e5f944e866814f72f2074fb0f4220f + version: 1.22.1 source: Git: "https://github.com/pulp-platform/common_cells.git" dependencies: @@ -24,7 +23,7 @@ packages: Git: "https://github.com/pulp-platform/common_verification.git" dependencies: [] cva6: - revision: 91e6745380ccd371b2b346e5c9382cf45c6413d3 + revision: 3245e44ec49c1cdcd19eb298cd81f0672eaf81ca version: ~ source: Git: "https://github.com/pulp-platform/cva6.git" diff --git a/Bender.yml b/Bender.yml index 13e608811..89ef7b3fe 100644 --- a/Bender.yml +++ b/Bender.yml @@ -8,8 +8,8 @@ package: - "Paul Scheffler " dependencies: - axi: { git: "https://github.com/pulp-platform/axi.git", rev: axi_llc } - common_cells: { git: "https://github.com/pulp-platform/common_cells.git", version: 1.19.0 } + axi: { git: "https://github.com/pulp-platform/axi.git", version: 0.29.1 } + common_cells: { git: "https://github.com/pulp-platform/common_cells.git", version: 1.22.1 } cva6: { git: "https://github.com/pulp-platform/cva6.git", rev: acc_port } tech_cells_generic: { git: "https://github.com/pulp-platform/tech_cells_generic.git", version: 0.2.1 } diff --git a/hardware/deps/common_cells b/hardware/deps/common_cells index 1705edadc..015917ff3 160000 --- a/hardware/deps/common_cells +++ b/hardware/deps/common_cells @@ -1 +1 @@ -Subproject commit 1705edadceaa2c6dd5bff845796d02156901b05f +Subproject commit 015917ff33e5f944e866814f72f2074fb0f4220f From 1a8afbfdc55562105b79cb34109c8e9f74dc2cad Mon Sep 17 00:00:00 2001 From: Matheus Cavalcante Date: Wed, 14 Jul 2021 
13:48:21 +0200 Subject: [PATCH 16/20] [hardware] Replace LLC by an L2 memory --- Bender.yml | 1 + hardware/src/ara_soc.sv | 224 +++------- hardware/src/axi_to_mem.sv | 691 +++++++++++++++++++++++++++++++ hardware/tb/ara_tb.sv | 2 +- hardware/tb/ara_testharness.sv | 130 +----- hardware/tb/verilator/ara_tb.cpp | 2 +- 6 files changed, 772 insertions(+), 278 deletions(-) create mode 100644 hardware/src/axi_to_mem.sv diff --git a/Bender.yml b/Bender.yml index 89ef7b3fe..54e0885c5 100644 --- a/Bender.yml +++ b/Bender.yml @@ -27,6 +27,7 @@ sources: # Sources # Level 1 + - hardware/src/axi_to_mem.sv - hardware/src/ctrl_registers.sv - hardware/src/cva6_accel_first_pass_decoder.sv - hardware/src/ara_dispatcher.sv diff --git a/hardware/src/ara_soc.sv b/hardware/src/ara_soc.sv index 36ae4c3bf..6ac3e2a79 100644 --- a/hardware/src/ara_soc.sv +++ b/hardware/src/ara_soc.sv @@ -16,6 +16,8 @@ module ara_soc import axi_pkg::*; import ara_pkg::*; #( parameter int unsigned AxiAddrWidth = 64, parameter int unsigned AxiUserWidth = 1, parameter int unsigned AxiIdWidth = 6, + // Main memory + parameter int unsigned L2NumWords = 2**21, // Dependant parameters. DO NOT CHANGE! 
localparam type axi_data_t = logic [AxiDataWidth-1:0], localparam type axi_strb_t = logic [AxiDataWidth/8-1:0], @@ -23,68 +25,22 @@ module ara_soc import axi_pkg::*; import ara_pkg::*; #( localparam type axi_user_t = logic [AxiUserWidth-1:0], localparam type axi_id_t = logic [AxiIdWidth-1:0] ) ( - input logic clk_i, - input logic rst_ni, - output logic [63:0] exit_o, + input logic clk_i, + input logic rst_ni, + output logic [63:0] exit_o, // Scan chain - input logic scan_enable_i, - input logic scan_data_i, - output logic scan_data_o, + input logic scan_enable_i, + input logic scan_data_i, + output logic scan_data_o, // UART APB interface - output logic uart_penable_o, - output logic uart_pwrite_o, - output logic [31:0] uart_paddr_o, - output logic uart_psel_o, - output logic [31:0] uart_pwdata_o, - input logic [31:0] uart_prdata_i, - input logic uart_pready_i, - input logic uart_pslverr_i, - // AXI interface - output logic axi_aw_valid_o, - output axi_id_t axi_aw_id_o, - output axi_addr_t axi_aw_addr_o, - output len_t axi_aw_len_o, - output size_t axi_aw_size_o, - output burst_t axi_aw_burst_o, - output logic axi_aw_lock_o, - output cache_t axi_aw_cache_o, - output prot_t axi_aw_prot_o, - output qos_t axi_aw_qos_o, - output region_t axi_aw_region_o, - output atop_t axi_aw_atop_o, - output axi_user_t axi_aw_user_o, - input logic axi_aw_ready_i, - output logic axi_w_valid_o, - output axi_data_t axi_w_data_o, - output axi_strb_t axi_w_strb_o, - output logic axi_w_last_o, - output axi_user_t axi_w_user_o, - input logic axi_w_ready_i, - input logic axi_b_valid_i, - input axi_id_t axi_b_id_i, - input resp_t axi_b_resp_i, - input axi_user_t axi_b_user_i, - output logic axi_b_ready_o, - output logic axi_ar_valid_o, - output axi_id_t axi_ar_id_o, - output axi_addr_t axi_ar_addr_o, - output len_t axi_ar_len_o, - output size_t axi_ar_size_o, - output burst_t axi_ar_burst_o, - output logic axi_ar_lock_o, - output cache_t axi_ar_cache_o, - output prot_t axi_ar_prot_o, - output 
qos_t axi_ar_qos_o, - output region_t axi_ar_region_o, - output axi_user_t axi_ar_user_o, - input logic axi_ar_ready_i, - input logic axi_r_valid_i, - input axi_id_t axi_r_id_i, - input axi_data_t axi_r_data_i, - input resp_t axi_r_resp_i, - input logic axi_r_last_i, - input axi_user_t axi_r_user_i, - output logic axi_r_ready_o + output logic uart_penable_o, + output logic uart_pwrite_o, + output logic [31:0] uart_paddr_o, + output logic uart_psel_o, + output logic [31:0] uart_pwdata_o, + input logic [31:0] uart_prdata_i, + input logic uart_pready_i, + input logic uart_pslverr_i ); ////////////////////// @@ -207,9 +163,9 @@ module ara_soc import axi_pkg::*; import ara_pkg::*; #( MaxSlvTrans : 4, FallThrough : 1'b0, LatencyMode : axi_pkg::CUT_MST_PORTS, - PipelineStages : 0, AxiIdWidthSlvPorts: AxiCoreIdWidth, AxiIdUsedSlvPorts : AxiCoreIdWidth, + UniqueIds : 1'b0, AxiAddrWidth : AxiAddrWidth, AxiDataWidth : AxiWideDataWidth, NoAddrRules : NrAXISlaves @@ -255,6 +211,8 @@ module ara_soc import axi_pkg::*; import ara_pkg::*; #( // L2 // ////////// + `include "common_cells/registers.svh" + // The L2 memory does not support atomics axi_soc_wide_req_t l2mem_wide_axi_req_wo_atomics; @@ -273,109 +231,55 @@ module ara_soc import axi_pkg::*; import ara_pkg::*; #( .mst_resp_i(l2mem_wide_axi_resp_wo_atomics) ); - // AXI interface with the DRAM - axi_req_t dram_wide_axi_req; - axi_resp_t dram_wide_axi_resp; - - axi_llc_top #( - .SetAssociativity (8 ), - .NumLines (64 ), - .NumBlocks (8 ), - .AxiIdWidth (AxiSocIdWidth ), - .AxiAddrWidth (AxiAddrWidth ), - .AxiDataWidth (AxiWideDataWidth ), - .AxiUserWidth (AxiUserWidth ), - .AxiLiteAddrWidth (AxiAddrWidth ), - .AxiLiteDataWidth (AxiNarrowDataWidth ), - .slv_req_t (axi_soc_wide_req_t ), - .slv_resp_t (axi_soc_wide_resp_t ), - .mst_req_t (axi_req_t ), - .mst_resp_t (axi_resp_t ), - .lite_req_t (axi_lite_soc_narrow_req_t ), - .lite_resp_t (axi_lite_soc_narrow_resp_t), - .rule_full_t (axi_pkg::xbar_rule_64_t ), - .axi_addr_t 
(axi_addr_t ) - ) i_l2 ( - .clk_i (clk_i ), - .rst_ni (rst_ni ), - .test_i (1'b0 ), - .slv_req_i (l2mem_wide_axi_req_wo_atomics ), - .slv_resp_o (l2mem_wide_axi_resp_wo_atomics), - .mst_req_o (dram_wide_axi_req ), - .mst_resp_i (dram_wide_axi_resp ), - .conf_req_i ('0 ), - .conf_resp_o (/* Unused */ ), - .cached_start_addr_i (DRAMBase ), - .cached_end_addr_i (DRAMBase + DRAMLength ), - .spm_start_addr_i ('0 ), - .axi_llc_events_o (/* Unused */ ) + logic l2_req; + logic l2_we; + logic [AxiAddrWidth-1:0] l2_addr; + logic [AxiDataWidth/8-1:0] l2_be; + logic [AxiDataWidth-1:0] l2_wdata; + logic [AxiDataWidth-1:0] l2_rdata; + logic l2_rvalid; + + axi_to_mem #( + .AddrWidth (AxiAddrWidth ), + .DataWidth (AxiDataWidth ), + .IdWidth (AxiSocIdWidth ), + .NumBanks (1 ), + .axi_req_t (axi_soc_wide_req_t ), + .axi_resp_t(axi_soc_wide_resp_t) + ) i_axi_to_mem ( + .clk_i (clk_i ), + .rst_ni (rst_ni ), + .axi_req_i (l2mem_wide_axi_req_wo_atomics ), + .axi_resp_o (l2mem_wide_axi_resp_wo_atomics), + .mem_req_o (l2_req ), + .mem_gnt_i (l2_req ), // Always available + .mem_we_o (l2_we ), + .mem_addr_o (l2_addr ), + .mem_strb_o (l2_be ), + .mem_wdata_o (l2_wdata ), + .mem_rdata_i (l2_rdata ), + .mem_rvalid_i(l2_rvalid ), + .mem_atop_o (/* Unused */ ), + .busy_o (/* Unused */ ) ); - axi_req_t dram_wide_axi_req_cut; - axi_resp_t dram_wide_axi_resp_cut; - - axi_cut #( - .ar_chan_t(ar_chan_t ), - .r_chan_t (r_chan_t ), - .aw_chan_t(aw_chan_t ), - .w_chan_t (w_chan_t ), - .b_chan_t (b_chan_t ), - .req_t (axi_req_t ), - .resp_t (axi_resp_t) - ) i_dram_axi_cut ( - .clk_i (clk_i ), - .rst_ni (rst_ni ), - .slv_req_i (dram_wide_axi_req ), - .slv_resp_o(dram_wide_axi_resp ), - .mst_req_o (dram_wide_axi_req_cut ), - .mst_resp_i(dram_wide_axi_resp_cut) + tc_sram #( + .NumWords (L2NumWords ), + .NumPorts (1 ), + .DataWidth(AxiDataWidth) + ) i_dram ( + .clk_i (clk_i ), + .rst_ni (rst_ni ), + .req_i (l2_req ), + .we_i (l2_we ), + .addr_i 
(l2_addr[$clog2(L2NumWords)-1+$clog2(AxiDataWidth/8):$clog2(AxiDataWidth/8)]), + .wdata_i(l2_wdata ), + .be_i (l2_be ), + .rdata_o(l2_rdata ) ); - assign axi_aw_valid_o = dram_wide_axi_req_cut.aw_valid; - assign axi_aw_id_o = dram_wide_axi_req_cut.aw.id; - assign axi_aw_addr_o = dram_wide_axi_req_cut.aw.addr; - assign axi_aw_len_o = dram_wide_axi_req_cut.aw.len; - assign axi_aw_size_o = dram_wide_axi_req_cut.aw.size; - assign axi_aw_burst_o = dram_wide_axi_req_cut.aw.burst; - assign axi_aw_lock_o = dram_wide_axi_req_cut.aw.lock; - assign axi_aw_cache_o = dram_wide_axi_req_cut.aw.cache; - assign axi_aw_prot_o = dram_wide_axi_req_cut.aw.prot; - assign axi_aw_qos_o = dram_wide_axi_req_cut.aw.qos; - assign axi_aw_region_o = dram_wide_axi_req_cut.aw.region; - assign axi_aw_atop_o = dram_wide_axi_req_cut.aw.atop; - assign axi_aw_user_o = dram_wide_axi_req_cut.aw.user; - assign dram_wide_axi_resp_cut.aw_ready = axi_aw_ready_i; - assign axi_w_valid_o = dram_wide_axi_req_cut.w_valid; - assign axi_w_data_o = dram_wide_axi_req_cut.w.data; - assign axi_w_strb_o = dram_wide_axi_req_cut.w.strb; - assign axi_w_last_o = dram_wide_axi_req_cut.w.last; - assign axi_w_user_o = dram_wide_axi_req_cut.w.user; - assign dram_wide_axi_resp_cut.w_ready = axi_w_ready_i; - assign dram_wide_axi_resp_cut.b_valid = axi_b_valid_i; - assign dram_wide_axi_resp_cut.b.id = axi_b_id_i; - assign dram_wide_axi_resp_cut.b.resp = axi_b_resp_i; - assign dram_wide_axi_resp_cut.b.user = axi_b_user_i; - assign axi_b_ready_o = dram_wide_axi_req_cut.b_ready; - assign axi_ar_valid_o = dram_wide_axi_req_cut.ar_valid; - assign axi_ar_id_o = dram_wide_axi_req_cut.ar.id; - assign axi_ar_addr_o = dram_wide_axi_req_cut.ar.addr; - assign axi_ar_len_o = dram_wide_axi_req_cut.ar.len; - assign axi_ar_size_o = dram_wide_axi_req_cut.ar.size; - assign axi_ar_burst_o = dram_wide_axi_req_cut.ar.burst; - assign axi_ar_lock_o = dram_wide_axi_req_cut.ar.lock; - assign axi_ar_cache_o = dram_wide_axi_req_cut.ar.cache; - assign 
axi_ar_prot_o = dram_wide_axi_req_cut.ar.prot; - assign axi_ar_qos_o = dram_wide_axi_req_cut.ar.qos; - assign axi_ar_region_o = dram_wide_axi_req_cut.ar.region; - assign axi_ar_user_o = dram_wide_axi_req_cut.ar.user; - assign dram_wide_axi_resp_cut.ar_ready = axi_ar_ready_i; - assign dram_wide_axi_resp_cut.r_valid = axi_r_valid_i; - assign dram_wide_axi_resp_cut.r.data = axi_r_data_i; - assign dram_wide_axi_resp_cut.r.id = axi_r_id_i; - assign dram_wide_axi_resp_cut.r.last = axi_r_last_i; - assign dram_wide_axi_resp_cut.r.resp = axi_r_resp_i; - assign dram_wide_axi_resp_cut.r.user = axi_r_user_i; - assign axi_r_ready_o = dram_wide_axi_req_cut.r_ready; + // One-cycle latency + `FF(l2_rvalid, l2_req, 1'b0); //////////// // UART // diff --git a/hardware/src/axi_to_mem.sv b/hardware/src/axi_to_mem.sv new file mode 100644 index 000000000..7a3db70de --- /dev/null +++ b/hardware/src/axi_to_mem.sv @@ -0,0 +1,691 @@ +// Copyright 2021 ETH Zurich and University of Bologna. +// Solderpad Hardware License, Version 0.51, see LICENSE for details. +// SPDX-License-Identifier: SHL-0.51 +// +// Authors: +// - Andreas Kurth +// - Wolfgang Roenninger + +`include "common_cells/registers.svh" +/// AXI4+ATOP slave module which translates AXI bursts into a memory stream. +/// If both read and write channels of the AXI4+ATOP are active, both will have a +/// utilization of 50%. +module axi_to_mem #( + /// AXI4+ATOP request type. See `include/axi/typedef.svh`. + parameter type axi_req_t = logic, + /// AXI4+ATOP response type. See `include/axi/typedef.svh`. + parameter type axi_resp_t = logic, + /// Address width, has to be less than or equal to the width of the AXI address field. + /// Determines the width of `mem_addr_o`. Has to be wide enough to emit the memory region + /// which should be accessible. + parameter int unsigned AddrWidth = 0, + /// AXI4+ATOP data width. + parameter int unsigned DataWidth = 0, + /// AXI4+ATOP ID width.
+ parameter int unsigned IdWidth = 0, + /// Number of banks at output, must evenly divide `DataWidth`. + parameter int unsigned NumBanks = 0, + /// Depth of memory response buffer. This should be equal to the memory response latency. + parameter int unsigned BufDepth = 1, + /// Dependent parameter, do not override. Memory address type. + localparam type addr_t = logic [AddrWidth-1:0], + /// Dependent parameter, do not override. Memory data type. + localparam type mem_data_t = logic [DataWidth/NumBanks-1:0], + /// Dependent parameter, do not override. Memory write strobe type. + localparam type mem_strb_t = logic [DataWidth/NumBanks/8-1:0] +) ( + /// Clock input. + input logic clk_i, + /// Asynchronous reset, active low. + input logic rst_ni, + /// The unit is busy handling an AXI4+ATOP request. + output logic busy_o, + /// AXI4+ATOP slave port, request input. + input axi_req_t axi_req_i, + /// AXI4+ATOP slave port, response output. + output axi_resp_t axi_resp_o, + /// Memory stream master, request is valid for this bank. + output logic [NumBanks-1:0] mem_req_o, + /// Memory stream master, request can be granted by this bank. + input logic [NumBanks-1:0] mem_gnt_i, + /// Memory stream master, byte address of the request. + output addr_t [NumBanks-1:0] mem_addr_o, + /// Memory stream master, write data for this bank. Valid when `mem_req_o`. + output mem_data_t [NumBanks-1:0] mem_wdata_o, + /// Memory stream master, byte-wise strobe (byte enable). + output mem_strb_t [NumBanks-1:0] mem_strb_o, + /// Memory stream master, `axi_pkg::atop_t` signal associated with this request. + output axi_pkg::atop_t [NumBanks-1:0] mem_atop_o, + /// Memory stream master, write enable. When asserted, a store of `mem_wdata_o` is requested. + output logic [NumBanks-1:0] mem_we_o, + /// Memory stream master, response is valid. This module always expects a valid response for a + /// request, regardless of whether the request was a write or a read.
+ input logic [NumBanks-1:0] mem_rvalid_i, + /// Memory stream master, read response data. + input mem_data_t [NumBanks-1:0] mem_rdata_i +); + + typedef logic [DataWidth-1:0] axi_data_t; + typedef logic [DataWidth/8-1:0] axi_strb_t; + typedef logic [IdWidth-1:0] axi_id_t; + + typedef struct packed { + addr_t addr; + axi_pkg::atop_t atop; + axi_strb_t strb; + axi_data_t wdata; + logic we; + } mem_req_t; + + typedef struct packed { + addr_t addr; + axi_pkg::atop_t atop; + axi_id_t id; + logic last; + axi_pkg::qos_t qos; + axi_pkg::size_t size; + logic write; + } meta_t; + + axi_data_t mem_rdata, + m2s_resp; + axi_pkg::len_t r_cnt_d, r_cnt_q, + w_cnt_d, w_cnt_q; + logic arb_valid, arb_ready, + rd_valid, rd_ready, + wr_valid, wr_ready, + sel_b, sel_buf_b, + sel_r, sel_buf_r, + sel_valid, sel_ready, + sel_buf_valid, sel_buf_ready, + sel_lock_d, sel_lock_q, + meta_valid, meta_ready, + meta_buf_valid, meta_buf_ready, + meta_sel_d, meta_sel_q, + m2s_req_valid, m2s_req_ready, + m2s_resp_valid, m2s_resp_ready, + mem_req_valid, mem_req_ready, + mem_rvalid; + mem_req_t m2s_req, + mem_req; + meta_t rd_meta, + rd_meta_d, rd_meta_q, + wr_meta, + wr_meta_d, wr_meta_q, + meta, meta_buf; + + assign busy_o = axi_req_i.aw_valid | axi_req_i.ar_valid | axi_req_i.w_valid | + axi_resp_o.b_valid | axi_resp_o.r_valid | + (r_cnt_q > 0) | (w_cnt_q > 0); + + // Handle reads. + always_comb begin + // Default assignments + axi_resp_o.ar_ready = 1'b0; + rd_meta_d = rd_meta_q; + rd_meta = meta_t'{default: '0}; + rd_valid = 1'b0; + r_cnt_d = r_cnt_q; + // Handle R burst in progress. + if (r_cnt_q > '0) begin + rd_meta_d.last = (r_cnt_q == 8'd1); + rd_meta = rd_meta_d; + rd_meta.addr = rd_meta_q.addr + axi_pkg::num_bytes(rd_meta_q.size); + rd_valid = 1'b1; + if (rd_ready) begin + r_cnt_d--; + rd_meta_d.addr = rd_meta.addr; + end + // Handle new AR if there is one. 
+ end else if (axi_req_i.ar_valid) begin + rd_meta_d = '{ + addr: addr_t'(axi_pkg::aligned_addr(axi_req_i.ar.addr, axi_req_i.ar.size)), + atop: '0, + id: axi_req_i.ar.id, + last: (axi_req_i.ar.len == '0), + qos: axi_req_i.ar.qos, + size: axi_req_i.ar.size, + write: 1'b0 + }; + rd_meta = rd_meta_d; + rd_meta.addr = addr_t'(axi_req_i.ar.addr); + rd_valid = 1'b1; + if (rd_ready) begin + r_cnt_d = axi_req_i.ar.len; + axi_resp_o.ar_ready = 1'b1; + end + end + end + + // Handle writes. + always_comb begin + // Default assignments + axi_resp_o.aw_ready = 1'b0; + axi_resp_o.w_ready = 1'b0; + wr_meta_d = wr_meta_q; + wr_meta = meta_t'{default: '0}; + wr_valid = 1'b0; + w_cnt_d = w_cnt_q; + // Handle W bursts in progress. + if (w_cnt_q > '0) begin + wr_meta_d.last = (w_cnt_q == 8'd1); + wr_meta = wr_meta_d; + wr_meta.addr = wr_meta_q.addr + axi_pkg::num_bytes(wr_meta_q.size); + if (axi_req_i.w_valid) begin + wr_valid = 1'b1; + if (wr_ready) begin + axi_resp_o.w_ready = 1'b1; + w_cnt_d--; + wr_meta_d.addr = wr_meta.addr; + end + end + // Handle new AW if there is one. + end else if (axi_req_i.aw_valid && axi_req_i.w_valid) begin + wr_meta_d = '{ + addr: addr_t'(axi_pkg::aligned_addr(axi_req_i.aw.addr, axi_req_i.aw.size)), + atop: axi_req_i.aw.atop, + id: axi_req_i.aw.id, + last: (axi_req_i.aw.len == '0), + qos: axi_req_i.aw.qos, + size: axi_req_i.aw.size, + write: 1'b1 + }; + wr_meta = wr_meta_d; + wr_meta.addr = addr_t'(axi_req_i.aw.addr); + wr_valid = 1'b1; + if (wr_ready) begin + w_cnt_d = axi_req_i.aw.len; + axi_resp_o.aw_ready = 1'b1; + axi_resp_o.w_ready = 1'b1; + end + end + end + + // Arbitrate between reads and writes. 
+ stream_mux #( + .DATA_T ( meta_t ), + .N_INP ( 32'd2 ) + ) i_ax_mux ( + .inp_data_i ({wr_meta, rd_meta }), + .inp_valid_i ({wr_valid, rd_valid}), + .inp_ready_o ({wr_ready, rd_ready}), + .inp_sel_i ( meta_sel_d ), + .oup_data_o ( meta ), + .oup_valid_o ( arb_valid ), + .oup_ready_i ( arb_ready ) + ); + always_comb begin + meta_sel_d = meta_sel_q; + sel_lock_d = sel_lock_q; + if (sel_lock_q) begin + meta_sel_d = meta_sel_q; + if (arb_valid && arb_ready) begin + sel_lock_d = 1'b0; + end + end else begin + if (wr_valid ^ rd_valid) begin + // If either write or read is valid but not both, select the valid one. + meta_sel_d = wr_valid; + end else if (wr_valid && rd_valid) begin + // If both write and read are valid, decide according to QoS then burst properties. + // Prioritize higher QoS. + if (wr_meta.qos > rd_meta.qos) begin + meta_sel_d = 1'b1; + end else if (rd_meta.qos > wr_meta.qos) begin + meta_sel_d = 1'b0; + // Decide requests with identical QoS. + end else if (wr_meta.qos == rd_meta.qos) begin + // 1. Prioritize individual writes over read bursts. + // Rationale: Read bursts can be interleaved on AXI but write bursts cannot. + if (wr_meta.last && !rd_meta.last) begin + meta_sel_d = 1'b1; + // 2. Prioritize ongoing burst. + // Rationale: Stalled bursts create back-pressure or require costly buffers. + end else if (w_cnt_q > '0) begin + meta_sel_d = 1'b1; + end else if (r_cnt_q > '0) begin + meta_sel_d = 1'b0; + // 3. Otherwise arbitrate round robin to prevent starvation. + end else begin + meta_sel_d = ~meta_sel_q; + end + end + end + // Lock arbitration if valid but not yet ready. + if (arb_valid && !arb_ready) begin + sel_lock_d = 1'b1; + end + end + end + + // Fork arbitrated stream to meta data, memory requests, and R/B channel selection. 
+ stream_fork #( + .N_OUP ( 32'd3 ) + ) i_fork ( + .clk_i, + .rst_ni, + .valid_i ( arb_valid ), + .ready_o ( arb_ready ), + .valid_o ({sel_valid, meta_valid, m2s_req_valid}), + .ready_i ({sel_ready, meta_ready, m2s_req_ready}) + ); + + assign sel_b = meta.write & meta.last; + assign sel_r = ~meta.write | meta.atop[5]; + + stream_fifo #( + .FALL_THROUGH ( 1'b1 ), + .DEPTH ( 32'd1 + BufDepth ), + .T ( logic[1:0] ) + ) i_sel_buf ( + .clk_i, + .rst_ni, + .flush_i ( 1'b0 ), + .testmode_i ( 1'b0 ), + .data_i ({sel_b, sel_r }), + .valid_i ( sel_valid ), + .ready_o ( sel_ready ), + .data_o ({sel_buf_b, sel_buf_r}), + .valid_o ( sel_buf_valid ), + .ready_i ( sel_buf_ready ), + .usage_o ( /* unused */ ) + ); + + stream_fifo #( + .FALL_THROUGH ( 1'b1 ), + .DEPTH ( 32'd1 + BufDepth ), + .T ( meta_t ) + ) i_meta_buf ( + .clk_i, + .rst_ni, + .flush_i ( 1'b0 ), + .testmode_i ( 1'b0 ), + .data_i ( meta ), + .valid_i ( meta_valid ), + .ready_o ( meta_ready ), + .data_o ( meta_buf ), + .valid_o ( meta_buf_valid ), + .ready_i ( meta_buf_ready ), + .usage_o ( /* unused */ ) + ); + + // Assemble the actual memory request from meta information and write data. + assign m2s_req = mem_req_t'{ + addr: meta.addr, + atop: meta.atop, + strb: axi_req_i.w.strb, + wdata: axi_req_i.w.data, + we: meta.write + }; + + // Interface memory as stream. + stream_to_mem #( + .mem_req_t ( mem_req_t ), + .mem_resp_t ( axi_data_t ), + .BufDepth ( BufDepth ) + ) i_stream_to_mem ( + .clk_i, + .rst_ni, + .req_i ( m2s_req ), + .req_valid_i ( m2s_req_valid ), + .req_ready_o ( m2s_req_ready ), + .resp_o ( m2s_resp ), + .resp_valid_o ( m2s_resp_valid ), + .resp_ready_i ( m2s_resp_ready ), + .mem_req_o ( mem_req ), + .mem_req_valid_o ( mem_req_valid ), + .mem_req_ready_i ( mem_req_ready ), + .mem_resp_i ( mem_rdata ), + .mem_resp_valid_i ( mem_rvalid ) + ); + + // Split single memory request to desired number of banks. 
+ mem_to_banks #( + .AddrWidth ( AddrWidth ), + .DataWidth ( DataWidth ), + .NumBanks ( NumBanks ) + ) i_mem_to_banks ( + .clk_i, + .rst_ni, + .req_i ( mem_req_valid ), + .gnt_o ( mem_req_ready ), + .addr_i ( mem_req.addr ), + .wdata_i ( mem_req.wdata ), + .strb_i ( mem_req.strb ), + .atop_i ( mem_req.atop ), + .we_i ( mem_req.we ), + .rvalid_o ( mem_rvalid ), + .rdata_o ( mem_rdata ), + .bank_req_o ( mem_req_o ), + .bank_gnt_i ( mem_gnt_i ), + .bank_addr_o ( mem_addr_o ), + .bank_wdata_o ( mem_wdata_o ), + .bank_strb_o ( mem_strb_o ), + .bank_atop_o ( mem_atop_o ), + .bank_we_o ( mem_we_o ), + .bank_rvalid_i ( mem_rvalid_i ), + .bank_rdata_i ( mem_rdata_i ) + ); + + // Join memory read data and meta data stream. + logic mem_join_valid, mem_join_ready; + stream_join #( + .N_INP ( 32'd2 ) + ) i_join ( + .inp_valid_i ({m2s_resp_valid, meta_buf_valid}), + .inp_ready_o ({m2s_resp_ready, meta_buf_ready}), + .oup_valid_o ( mem_join_valid ), + .oup_ready_i ( mem_join_ready ) + ); + + // Dynamically fork the joined stream to B and R channels. + stream_fork_dynamic #( + .N_OUP ( 32'd2 ) + ) i_fork_dynamic ( + .clk_i, + .rst_ni, + .valid_i ( mem_join_valid ), + .ready_o ( mem_join_ready ), + .sel_i ({sel_buf_b, sel_buf_r }), + .sel_valid_i ( sel_buf_valid ), + .sel_ready_o ( sel_buf_ready ), + .valid_o ({axi_resp_o.b_valid, axi_resp_o.r_valid}), + .ready_i ({axi_req_i.b_ready, axi_req_i.r_ready }) + ); + + // Compose B responses. + assign axi_resp_o.b = '{ + id: meta_buf.id, + resp: axi_pkg::RESP_OKAY, + user: '0 + }; + + // Compose R responses. 
+ assign axi_resp_o.r = '{ + data: m2s_resp, + id: meta_buf.id, + last: meta_buf.last, + resp: axi_pkg::RESP_OKAY, + user: '0 + }; + + // Registers + `FFARN(meta_sel_q, meta_sel_d, 1'b0, clk_i, rst_ni) + `FFARN(sel_lock_q, sel_lock_d, 1'b0, clk_i, rst_ni) + `FFARN(rd_meta_q, rd_meta_d, meta_t'{default: '0}, clk_i, rst_ni) + `FFARN(wr_meta_q, wr_meta_d, meta_t'{default: '0}, clk_i, rst_ni) + `FFARN(r_cnt_q, r_cnt_d, '0, clk_i, rst_ni) + `FFARN(w_cnt_q, w_cnt_d, '0, clk_i, rst_ni) + + // Assertions + // pragma translate_off + `ifndef VERILATOR + default disable iff (!rst_ni); + assume property (@(posedge clk_i) + axi_req_i.ar_valid && !axi_resp_o.ar_ready |=> $stable(axi_req_i.ar)) + else $error("AR must remain stable until handshake has happened!"); + assert property (@(posedge clk_i) + axi_resp_o.r_valid && !axi_req_i.r_ready |=> $stable(axi_resp_o.r)) + else $error("R must remain stable until handshake has happened!"); + assume property (@(posedge clk_i) + axi_req_i.aw_valid && !axi_resp_o.aw_ready |=> $stable(axi_req_i.aw)) + else $error("AW must remain stable until handshake has happened!"); + assume property (@(posedge clk_i) + axi_req_i.w_valid && !axi_resp_o.w_ready |=> $stable(axi_req_i.w)) + else $error("W must remain stable until handshake has happened!"); + assert property (@(posedge clk_i) + axi_resp_o.b_valid && !axi_req_i.b_ready |=> $stable(axi_resp_o.b)) + else $error("B must remain stable until handshake has happened!"); + assert property (@(posedge clk_i) axi_req_i.ar_valid && axi_req_i.ar.len > 0 |-> + axi_req_i.ar.burst == axi_pkg::BURST_INCR) + else $error("Non-incrementing bursts are not supported!"); + assert property (@(posedge clk_i) axi_req_i.aw_valid && axi_req_i.aw.len > 0 |-> + axi_req_i.aw.burst == axi_pkg::BURST_INCR) + else $error("Non-incrementing bursts are not supported!"); + assert property (@(posedge clk_i) meta_valid && meta.atop != '0 |-> meta.write) + else $warning("Unexpected atomic operation on read."); + `endif + // pragma 
translate_on +endmodule + + +`include "axi/assign.svh" +`include "axi/typedef.svh" +/// Interface wrapper for module `axi_to_mem`. +module axi_to_mem_intf #( + /// See `axi_to_mem`, parameter `AddrWidth`. + parameter int unsigned ADDR_WIDTH = 32'd0, + /// See `axi_to_mem`, parameter `DataWidth`. + parameter int unsigned DATA_WIDTH = 32'd0, + /// AXI4+ATOP ID width. + parameter int unsigned ID_WIDTH = 32'd0, + /// AXI4+ATOP user width. + parameter int unsigned USER_WIDTH = 32'd0, + /// See `axi_to_mem`, parameter `NumBanks`. + parameter int unsigned NUM_BANKS = 32'd0, + /// See `axi_to_mem`, parameter `BufDepth`. + parameter int unsigned BUF_DEPTH = 32'd1, + /// Dependent parameter, do not override. See `axi_to_mem`, parameter `addr_t`. + localparam type addr_t = logic [ADDR_WIDTH-1:0], + /// Dependent parameter, do not override. See `axi_to_mem`, parameter `mem_data_t`. + localparam type mem_data_t = logic [DATA_WIDTH/NUM_BANKS-1:0], + /// Dependent parameter, do not override. See `axi_to_mem`, parameter `mem_strb_t`. + localparam type mem_strb_t = logic [DATA_WIDTH/NUM_BANKS/8-1:0] +) ( + /// Clock input. + input logic clk_i, + /// Asynchronous reset, active low. + input logic rst_ni, + /// See `axi_to_mem`, port `busy_o`. + output logic busy_o, + /// AXI4+ATOP slave interface port. + AXI_BUS.Slave slv, + /// See `axi_to_mem`, port `mem_req_o`. + output logic [NUM_BANKS-1:0] mem_req_o, + /// See `axi_to_mem`, port `mem_gnt_i`. + input logic [NUM_BANKS-1:0] mem_gnt_i, + /// See `axi_to_mem`, port `mem_addr_o`. + output addr_t [NUM_BANKS-1:0] mem_addr_o, + /// See `axi_to_mem`, port `mem_wdata_o`. + output mem_data_t [NUM_BANKS-1:0] mem_wdata_o, + /// See `axi_to_mem`, port `mem_strb_o`. + output mem_strb_t [NUM_BANKS-1:0] mem_strb_o, + /// See `axi_to_mem`, port `mem_atop_o`. + output axi_pkg::atop_t [NUM_BANKS-1:0] mem_atop_o, + /// See `axi_to_mem`, port `mem_we_o`. + output logic [NUM_BANKS-1:0] mem_we_o, + /// See `axi_to_mem`, port `mem_rvalid_i`. 
+ input logic [NUM_BANKS-1:0] mem_rvalid_i, + /// See `axi_to_mem`, port `mem_rdata_i`. + input mem_data_t [NUM_BANKS-1:0] mem_rdata_i +); + typedef logic [ID_WIDTH-1:0] id_t; + typedef logic [DATA_WIDTH-1:0] data_t; + typedef logic [DATA_WIDTH/8-1:0] strb_t; + typedef logic [USER_WIDTH-1:0] user_t; + `AXI_TYPEDEF_AW_CHAN_T(aw_chan_t, addr_t, id_t, user_t) + `AXI_TYPEDEF_W_CHAN_T(w_chan_t, data_t, strb_t, user_t) + `AXI_TYPEDEF_B_CHAN_T(b_chan_t, id_t, user_t) + `AXI_TYPEDEF_AR_CHAN_T(ar_chan_t, addr_t, id_t, user_t) + `AXI_TYPEDEF_R_CHAN_T(r_chan_t, data_t, id_t, user_t) + `AXI_TYPEDEF_REQ_T(req_t, aw_chan_t, w_chan_t, ar_chan_t) + `AXI_TYPEDEF_RESP_T(resp_t, b_chan_t, r_chan_t) + req_t req; + resp_t resp; + `AXI_ASSIGN_TO_REQ(req, slv) + `AXI_ASSIGN_FROM_RESP(slv, resp) + axi_to_mem #( + .axi_req_t ( req_t ), + .axi_resp_t ( resp_t ), + .AddrWidth ( ADDR_WIDTH ), + .DataWidth ( DATA_WIDTH ), + .IdWidth ( ID_WIDTH ), + .NumBanks ( NUM_BANKS ), + .BufDepth ( BUF_DEPTH ) + ) i_axi_to_mem ( + .clk_i, + .rst_ni, + .busy_o, + .axi_req_i ( req ), + .axi_resp_o ( resp ), + .mem_req_o, + .mem_gnt_i, + .mem_addr_o, + .mem_wdata_o, + .mem_strb_o, + .mem_atop_o, + .mem_we_o, + .mem_rvalid_i, + .mem_rdata_i + ); +endmodule + +/// Split memory access over multiple parallel banks, where each bank has its own req/gnt +/// request and valid response direction. +module mem_to_banks #( + /// Input address width. + parameter int unsigned AddrWidth = 32'd0, + /// Input data width, must be a power of two. + parameter int unsigned DataWidth = 32'd0, + /// Number of banks at output, must evenly divide `DataWidth`. + parameter int unsigned NumBanks = 32'd0, + /// Dependent parameter, do not override! Address type. + localparam type addr_t = logic [AddrWidth-1:0], + /// Dependent parameter, do not override! Input data type. + localparam type inp_data_t = logic [DataWidth-1:0], + /// Dependent parameter, do not override! Input write strobe type. 
+ localparam type inp_strb_t = logic [DataWidth/8-1:0], + /// Dependent parameter, do not override! Output data type. + localparam type oup_data_t = logic [DataWidth/NumBanks-1:0], + /// Dependent parameter, do not override! Output write strobe type. + localparam type oup_strb_t = logic [DataWidth/NumBanks/8-1:0] +) ( + /// Clock input. + input logic clk_i, + /// Asynchronous reset, active low. + input logic rst_ni, + /// Memory request to split, request is valid. + input logic req_i, + /// Memory request to split, request can be granted. + output logic gnt_o, + /// Memory request to split, request address, byte-wise. + input addr_t addr_i, + /// Memory request to split, request write data. + input inp_data_t wdata_i, + /// Memory request to split, request write strobe. + input inp_strb_t strb_i, + /// Memory request to split, request Atomic signal from AXI4+ATOP. + input axi_pkg::atop_t atop_i, + /// Memory request to split, request write enable, active high. + input logic we_i, + /// Memory request to split, response is valid. Required for read and write requests + output logic rvalid_o, + /// Memory request to split, response read data. + output inp_data_t rdata_o, + /// Memory bank request, request is valid. + output logic [NumBanks-1:0] bank_req_o, + /// Memory bank request, request can be granted. + input logic [NumBanks-1:0] bank_gnt_i, + /// Memory bank request, request address, byte-wise. Will be different for each bank. + output addr_t [NumBanks-1:0] bank_addr_o, + /// Memory bank request, request write data. + output oup_data_t [NumBanks-1:0] bank_wdata_o, + /// Memory bank request, request write strobe. + output oup_strb_t [NumBanks-1:0] bank_strb_o, + /// Memory bank request, request Atomic signal from AXI4+ATOP. + output axi_pkg::atop_t [NumBanks-1:0] bank_atop_o, + /// Memory bank request, request write enable, active high. + output logic [NumBanks-1:0] bank_we_o, + /// Memory bank request, response is valid. 
Required for read and write requests + input logic [NumBanks-1:0] bank_rvalid_i, + /// Memory bank request, response read data. + input oup_data_t [NumBanks-1:0] bank_rdata_i +); + + localparam DataBytes = $bits(inp_strb_t); + localparam BitsPerBank = $bits(oup_data_t); + localparam BytesPerBank = $bits(oup_strb_t); + + typedef struct packed { + addr_t addr; + oup_data_t wdata; + oup_strb_t strb; + axi_pkg::atop_t atop; + logic we; + } req_t; + + logic req_valid; + logic [NumBanks-1:0] req_ready, + resp_valid, resp_ready; + req_t [NumBanks-1:0] bank_req, + bank_oup; + + function automatic addr_t align_addr(input addr_t addr); + return (addr >> $clog2(DataBytes)) << $clog2(DataBytes); + endfunction + + // Handle requests. + assign req_valid = req_i & gnt_o; + for (genvar i = 0; unsigned'(i) < NumBanks; i++) begin : gen_reqs + assign bank_req[i].addr = align_addr(addr_i) + i * BytesPerBank; + assign bank_req[i].wdata = wdata_i[i*BitsPerBank+:BitsPerBank]; + assign bank_req[i].strb = strb_i[i*BytesPerBank+:BytesPerBank]; + assign bank_req[i].atop = atop_i; + assign bank_req[i].we = we_i; + fall_through_register #( + .T ( req_t ) + ) i_ft_reg ( + .clk_i, + .rst_ni, + .clr_i ( 1'b0 ), + .testmode_i ( 1'b0 ), + .valid_i ( req_valid ), + .ready_o ( req_ready[i] ), + .data_i ( bank_req[i] ), + .valid_o ( bank_req_o[i] ), + .ready_i ( bank_gnt_i[i] ), + .data_o ( bank_oup[i] ) + ); + assign bank_addr_o[i] = bank_oup[i].addr; + assign bank_wdata_o[i] = bank_oup[i].wdata; + assign bank_strb_o[i] = bank_oup[i].strb; + assign bank_atop_o[i] = bank_oup[i].atop; + assign bank_we_o[i] = bank_oup[i].we; + end + + // Grant output if all our requests have been granted. + assign gnt_o = (&req_ready) & (&resp_ready); + + // Handle responses. 
+ for (genvar i = 0; unsigned'(i) < NumBanks; i++) begin : gen_resp_regs + fall_through_register #( + .T ( oup_data_t ) + ) i_ft_reg ( + .clk_i, + .rst_ni, + .clr_i ( 1'b0 ), + .testmode_i ( 1'b0 ), + .valid_i ( bank_rvalid_i[i] ), + .ready_o ( resp_ready[i] ), + .data_i ( bank_rdata_i[i] ), + .data_o ( rdata_o[i*BitsPerBank+:BitsPerBank] ), + .ready_i ( rvalid_o ), + .valid_o ( resp_valid[i] ) + ); + end + assign rvalid_o = &resp_valid; + + // Assertions + // pragma translate_off + `ifndef VERILATOR + initial begin + assume (DataWidth != 0 && (DataWidth & (DataWidth - 1)) == 0) + else $fatal(1, "Data width must be a power of two!"); + assume (DataWidth % NumBanks == 0) + else $fatal(1, "Data width must be evenly divisible over banks!"); + assume ((DataWidth / NumBanks) % 8 == 0) + else $fatal(1, "Data width of each bank must be divisible into 8-bit bytes!"); + end + `endif + // pragma translate_on +endmodule diff --git a/hardware/tb/ara_tb.sv b/hardware/tb/ara_tb.sv index 30be2f091..53b2a3869 100644 --- a/hardware/tb/ara_tb.sv +++ b/hardware/tb/ara_tb.sv @@ -118,7 +118,7 @@ module ara_tb; if (address >= DRAMAddrBase && address < DRAMAddrBase + DRAMLength) // This requires the sections to be aligned to AxiWideByteOffset, // otherwise, they can be over-written. 
- dut.i_dram.init_val[(address - DRAMAddrBase + (w << AxiWideByteOffset)) >> AxiWideByteOffset] = mem_row; + dut.i_ara_soc.i_dram.init_val[(address - DRAMAddrBase + (w << AxiWideByteOffset)) >> AxiWideByteOffset] = mem_row; else $display("Cannot initialize address %x, which doesn't fall into the L2 region.", address); end diff --git a/hardware/tb/ara_testharness.sv b/hardware/tb/ara_testharness.sv index b19c2c855..23328b7b3 100644 --- a/hardware/tb/ara_testharness.sv +++ b/hardware/tb/ara_testharness.sv @@ -10,7 +10,6 @@ module ara_testharness #( // Ara-specific parameters parameter int unsigned NrLanes = 0, - parameter int unsigned NumWords = 2**21, // memory size // AXI Parameters parameter int unsigned AxiUserWidth = 1, parameter int unsigned AxiIdWidth = 6, @@ -23,7 +22,6 @@ module ara_testharness #( ); `include "axi/typedef.svh" - `include "common_cells/registers.svh" /***************** * Definitions * @@ -72,67 +70,21 @@ module ara_testharness #( .AxiIdWidth (AxiIdWidth ), .AxiUserWidth(AxiUserWidth ) ) i_ara_soc ( - .clk_i (clk_i ), - .rst_ni (rst_ni ), - .exit_o (exit_o ), - .scan_enable_i (1'b0 ), - .scan_data_i (1'b0 ), - .scan_data_o (/* Unused */ ), + .clk_i (clk_i ), + .rst_ni (rst_ni ), + .exit_o (exit_o ), + .scan_enable_i (1'b0 ), + .scan_data_i (1'b0 ), + .scan_data_o (/* Unused */), // UART - .uart_penable_o (uart_penable ), - .uart_pwrite_o (uart_pwrite ), - .uart_paddr_o (uart_paddr ), - .uart_psel_o (uart_psel ), - .uart_pwdata_o (uart_pwdata ), - .uart_prdata_i (uart_prdata ), - .uart_pready_i (uart_pready ), - .uart_pslverr_i (uart_pslverr ), - // AXI - .axi_aw_valid_o (dram_req.aw_valid ), - .axi_aw_id_o (dram_req.aw.id ), - .axi_aw_addr_o (dram_req.aw.addr ), - .axi_aw_len_o (dram_req.aw.len ), - .axi_aw_size_o (dram_req.aw.size ), - .axi_aw_burst_o (dram_req.aw.burst ), - .axi_aw_lock_o (dram_req.aw.lock ), - .axi_aw_cache_o (dram_req.aw.cache ), - .axi_aw_prot_o (dram_req.aw.prot ), - .axi_aw_qos_o (dram_req.aw.qos ), - 
.axi_aw_region_o(dram_req.aw.region), - .axi_aw_atop_o (dram_req.aw.atop ), - .axi_aw_user_o (dram_req.aw.user ), - .axi_aw_ready_i (dram_resp.aw_ready), - .axi_w_valid_o (dram_req.w_valid ), - .axi_w_data_o (dram_req.w.data ), - .axi_w_strb_o (dram_req.w.strb ), - .axi_w_last_o (dram_req.w.last ), - .axi_w_user_o (dram_req.w.user ), - .axi_w_ready_i (dram_resp.w_ready ), - .axi_b_valid_i (dram_resp.b_valid ), - .axi_b_id_i (dram_resp.b.id ), - .axi_b_resp_i (dram_resp.b.resp ), - .axi_b_user_i (dram_resp.b.user ), - .axi_b_ready_o (dram_req.b_ready ), - .axi_ar_valid_o (dram_req.ar_valid ), - .axi_ar_id_o (dram_req.ar.id ), - .axi_ar_addr_o (dram_req.ar.addr ), - .axi_ar_len_o (dram_req.ar.len ), - .axi_ar_size_o (dram_req.ar.size ), - .axi_ar_burst_o (dram_req.ar.burst ), - .axi_ar_lock_o (dram_req.ar.lock ), - .axi_ar_cache_o (dram_req.ar.cache ), - .axi_ar_prot_o (dram_req.ar.prot ), - .axi_ar_qos_o (dram_req.ar.qos ), - .axi_ar_region_o(dram_req.ar.region), - .axi_ar_user_o (dram_req.ar.user ), - .axi_ar_ready_i (dram_resp.ar_ready), - .axi_r_valid_i (dram_resp.r_valid ), - .axi_r_data_i (dram_resp.r.data ), - .axi_r_id_i (dram_resp.r.id ), - .axi_r_resp_i (dram_resp.r.resp ), - .axi_r_last_i (dram_resp.r.last ), - .axi_r_user_i (dram_resp.r.user ), - .axi_r_ready_o (dram_req.r_ready ) + .uart_penable_o(uart_penable), + .uart_pwrite_o (uart_pwrite ), + .uart_paddr_o (uart_paddr ), + .uart_psel_o (uart_psel ), + .uart_pwdata_o (uart_pwdata ), + .uart_prdata_i (uart_prdata ), + .uart_pready_i (uart_pready ), + .uart_pslverr_i(uart_pslverr) ); /********** @@ -152,58 +104,4 @@ module ara_testharness #( .pslverr_o(uart_pslverr) ); - /********** - * DRAM * - **********/ - - logic req; - logic we; - logic [AxiAddrWidth-1:0] addr; - logic [AxiDataWidth/8-1:0] be; - logic [AxiDataWidth-1:0] wdata; - logic [AxiDataWidth-1:0] rdata; - logic rvalid; - - axi_to_mem #( - .AddrWidth (AxiAddrWidth), - .DataWidth (AxiDataWidth), - .IdWidth (AxiIdWidth ), - .NumBanks (1 ), - 
.axi_req_t (axi_req_t ), - .axi_resp_t(axi_resp_t ) - ) i_axi_to_mem ( - .clk_i (clk_i ), - .rst_ni (rst_ni ), - .axi_req_i (dram_req ), - .axi_resp_o (dram_resp ), - .mem_req_o (req ), - .mem_gnt_i (req ), // Always available - .mem_we_o (we ), - .mem_addr_o (addr ), - .mem_strb_o (be ), - .mem_wdata_o (wdata ), - .mem_rdata_i (rdata ), - .mem_rvalid_i(rvalid ), - .mem_atop_o (/* Unused */), - .busy_o (/* Unused */) - ); - - tc_sram #( - .NumWords (NumWords ), - .NumPorts (1 ), - .DataWidth(AxiDataWidth) - ) i_dram ( - .clk_i (clk_i ), - .rst_ni (rst_ni ), - .req_i (req ), - .we_i (we ), - .addr_i (addr[$clog2(NumWords)-1+$clog2(AxiDataWidth/8):$clog2(AxiDataWidth/8)]), - .wdata_i(wdata ), - .be_i (be ), - .rdata_o(rdata ) - ); - - // One-cycle latency - `FF(rvalid, req, 1'b0); - endmodule : ara_testharness diff --git a/hardware/tb/verilator/ara_tb.cpp b/hardware/tb/verilator/ara_tb.cpp index 0bb037403..4b05125bc 100644 --- a/hardware/tb/verilator/ara_tb.cpp +++ b/hardware/tb/verilator/ara_tb.cpp @@ -26,7 +26,7 @@ int main(int argc, char **argv) { // Initialize the DRAM MemAreaLoc l2_mem = {.base=0x80000000, .size=0x00080000}; memutil.RegisterMemoryArea( - "ram", "TOP.ara_tb_verilator.dut.i_dram", 64*NR_LANES/2, &l2_mem); + "ram", "TOP.ara_tb_verilator.dut.i_ara_soc.i_dram", 64*NR_LANES/2, &l2_mem); simctrl.RegisterExtension(&memutil); simctrl.SetInitialResetDelay(5); From 48ed6200a76eae7f34b2de355cb18320b190f651 Mon Sep 17 00:00:00 2001 From: Matheus Cavalcante Date: Wed, 14 Jul 2021 20:48:30 +0200 Subject: [PATCH 17/20] [hardware] :bug: Delay memory requests if the `axi_inval_filter` is busy --- hardware/src/axi_inval_filter.sv | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/hardware/src/axi_inval_filter.sv b/hardware/src/axi_inval_filter.sv index 7fe2fb0e8..9c7e89916 100644 --- a/hardware/src/axi_inval_filter.sv +++ b/hardware/src/axi_inval_filter.sv @@ -66,7 +66,10 @@ module axi_inval_filter #( slv_resp_o = mst_resp_i; // Do not accept 
new AWs if FIFO is full - if (aw_fifo_full) slv_resp_o.aw_ready = 1'b0; + if (aw_fifo_full) begin + slv_resp_o.aw_ready = 1'b0; + mst_req_o.aw_valid = 1'b0; + end end /////////////////////// From b9118cca23a753c2f6e173c1bd443935954968eb Mon Sep 17 00:00:00 2001 From: Matheus Cavalcante Date: Wed, 14 Jul 2021 22:52:28 +0200 Subject: [PATCH 18/20] [hardware] Reduce size of the DRAM --- hardware/src/ara_soc.sv | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hardware/src/ara_soc.sv b/hardware/src/ara_soc.sv index 6ac3e2a79..29538cb9e 100644 --- a/hardware/src/ara_soc.sv +++ b/hardware/src/ara_soc.sv @@ -17,7 +17,7 @@ module ara_soc import axi_pkg::*; import ara_pkg::*; #( parameter int unsigned AxiUserWidth = 1, parameter int unsigned AxiIdWidth = 6, // Main memory - parameter int unsigned L2NumWords = 2**21, + parameter int unsigned L2NumWords = 2**19, // Dependant parameters. DO NOT CHANGE! localparam type axi_data_t = logic [AxiDataWidth-1:0], localparam type axi_strb_t = logic [AxiDataWidth/8-1:0], From a2653cfb08dcd544339bcc394834a6c77a7c3d4f Mon Sep 17 00:00:00 2001 From: Matheus Cavalcante Date: Wed, 14 Jul 2021 22:53:59 +0200 Subject: [PATCH 19/20] [verilator] Ensure we can preload memories 512 bits wide --- .../patches/0001-tech-cells-generic-sram.patch | 14 +++++++------- .../cpp/dpi_memutil.cc | 12 ++++++------ 2 files changed, 13 insertions(+), 13 deletions(-) diff --git a/hardware/patches/0001-tech-cells-generic-sram.patch b/hardware/patches/0001-tech-cells-generic-sram.patch index fee825950..dc1e1a99b 100644 --- a/hardware/patches/0001-tech-cells-generic-sram.patch +++ b/hardware/patches/0001-tech-cells-generic-sram.patch @@ -32,7 +32,7 @@ index 53530e0..075dcea 100644 @@ -164,6 +168,23 @@ module tc_sram #( end // if !rst_ni end - + + `ifdef VERILATOR + for (genvar i = 0; i < NumPorts; i++) begin + // update value when write is set at clock @@ -86,9 +86,9 @@ index 53530e0..075dcea 100644 + // Function for setting a specific element in 
|sram| + // Returns 1 (true) for success, 0 (false) for errors. + export "DPI-C" function simutil_set_mem; -+ function int simutil_set_mem(input int index, input bit [255:0] val); -+ // Function will only work for memories <= 256 bits -+ if (DataWidth > 256) ++ function int simutil_set_mem(input int index, input bit [511:0] val); ++ // Function will only work for memories <= 512 bits ++ if (DataWidth > 512) + return 0; + if (index >= NumWords) + return 0; @@ -99,9 +99,9 @@ index 53530e0..075dcea 100644 + + // Function for getting a specific element in |sram| + export "DPI-C" function simutil_get_mem; -+ function int simutil_get_mem(input int index, output bit [255:0] val); -+ // Function will only work for memories <= 256 bits -+ if (DataWidth > 256) ++ function int simutil_get_mem(input int index, output bit [511:0] val); ++ // Function will only work for memories <= 512 bits ++ if (DataWidth > 512) + return 0; + if (index >= NumWords) + return 0; diff --git a/hardware/tb/verilator/lowrisc_dv_verilator_memutil_dpi/cpp/dpi_memutil.cc b/hardware/tb/verilator/lowrisc_dv_verilator_memutil_dpi/cpp/dpi_memutil.cc index 0434c45d7..a7a37927f 100644 --- a/hardware/tb/verilator/lowrisc_dv_verilator_memutil_dpi/cpp/dpi_memutil.cc +++ b/hardware/tb/verilator/lowrisc_dv_verilator_memutil_dpi/cpp/dpi_memutil.cc @@ -253,7 +253,7 @@ static void WriteSegment(const MemArea &m, uint32_t offset, "0x" << std::hex << m.addr_loc.size << " " << "write with offset: 0x" << std::hex << offset << " " << "write with size: 0x" << std::hex << data.size() << "\n"; - assert(m.width_byte <= 32); + assert(m.width_byte <= 64); assert(m.addr_loc.size == 0 || offset + data.size() <= m.addr_loc.size); assert((offset % m.width_byte) == 0); @@ -262,11 +262,11 @@ static void WriteSegment(const MemArea &m, uint32_t offset, SVScoped scoped(m.location.data()); // This "mini buffer" is used to transfer each write to SystemVerilog. 
It's - // not massively efficient, but doing so ensures that we pass 256 bits (32 + // not massively efficient, but doing so ensures that we pass 512 bits (64 // bytes) of initialised data each time. This is for simutil_set_mem (defined // in prim_util_memload.svh), whose "val" argument has SystemVerilog type bit - // [255:0]. - uint8_t minibuf[32]; + // [511:0]. + uint8_t minibuf[64]; memset(minibuf, 0, sizeof minibuf); assert(m.width_byte <= sizeof minibuf); @@ -421,8 +421,8 @@ bool DpiMemUtil::RegisterMemoryArea(const std::string name, const std::string location, size_t width_bit, const MemAreaLoc *addr_loc) { - assert((width_bit <= 256) && - "TODO: Memory loading only supported up to 256 bits."); + assert((width_bit <= 512) && + "TODO: Memory loading only supported up to 512 bits."); assert(width_bit % 8 == 0); // First, create and register the memory by name From 9aa4f1fc5bef5af42fd9f9817c58f3443b073386 Mon Sep 17 00:00:00 2001 From: Matheus Cavalcante Date: Wed, 14 Jul 2021 23:15:18 +0200 Subject: [PATCH 20/20] [CHANGELOG] Update Changelog --- CHANGELOG.md | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 5ade25ee7..1f66364c1 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,13 +6,29 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0. 
## [Unreleased] +### Fixed + +- Fix calculation of `vstu`'s vector length +- Fix `vslideup` and `vslidedown` operand's vector length trimming +- Mute mask requests on idle lanes +- Mute instructions with vector length zero on the respective `lane_sequencer` and `operand_requester` +- Fix `simd_div`'s offset calculation +- Delay acknowledgment of memory requests if the `axi_inval_filter` is busy + ### Added - Format source files in the `apps` folder with clang-format by running `make format` +- Support for the `2_lanes`, `8_lanes`, and `16_lanes` configurations, besides the default `4_lanes` one ### Changed - Compile Verilator and Ara's verilated model with LLVM, for a faster compile time. +- Verilator updated to version v4.210. +- Verilation is done with a hierarchical verilation flow +- Replace `ara_soc`'s LLC with a simple main memory +- Reduce number of words on the main memory, for faster Verilation +- Update `common_cells` to v1.22.1 +- Update `axi` to v0.29.1 ## 2.0.0 - 2021-06-24