В общем случае, наверное, так:
// shift_load_rga: bank of N WIDTH-bit registers with two clocked modes.
//
// Parameters:
//   N     - number of registers in the bank (also the width of `in`).
//   WIDTH - width of each register and of `out`; any value >= 1.
//
// Ports:
//   clk   - all registers update on the rising edge.
//   sload - mode select (see "Modes").
//   in    - one data bit per register.
//   out   - current contents of the last register, in_rg[N-1].
//
// Modes:
//   sload == 0 : every register i shifts left by one bit and takes in[i]
//                as its new LSB.
//   otherwise  : the bank moves one word along the chain: in_rg[0] receives
//                in[0] zero-extended to WIDTH bits, and in_rg[i] receives
//                in_rg[i-1] for every i > 0.
//
// There is no reset, so in simulation the registers are X until written.
module shift_load_rga
#(
parameter N = 4,
parameter WIDTH = 8
)
(
input clk,
input sload,
input [N-1:0] in,
output [WIDTH-1:0] out
);
    reg [WIDTH-1:0] in_rg [N-1:0];
    integer i;

    always @(posedge clk)
    begin
        if (sload == 1'b0) begin
            // Bit-serial mode. Shift-and-OR is used instead of the
            // concatenation {in_rg[i][WIDTH-2:0], in[i]} because that
            // part-select is illegal when WIDTH == 1; for WIDTH >= 2 the
            // two forms produce the same value.
            for (i = 0; i < N; i = i + 1)
                in_rg[i] <= (in_rg[i] << 1) | in[i];
        end
        else begin
            // Word-shift mode. The head of the chain takes the single bit
            // in[0]; the assignment zero-extends it to WIDTH bits.
            in_rg[0] <= in[0];
            // Non-blocking assignments: each register samples its
            // neighbour's value from before this clock edge.
            for (i = 1; i < N; i = i + 1)
                in_rg[i] <= in_rg[i-1];
        end
    end

    assign out = in_rg[N-1];
endmodule