diff --git a/README.md b/README.md
index 030f99e..573adc8 100644
--- a/README.md
+++ b/README.md
@@ -49,9 +49,10 @@ You should then be ready to generate some text.
 cargo run --release -- --tokenizer-model /path/to/tokenizer.model --model-path /path/to/LLaMA/7B --param-path /path/to/LLaMA/7B/params.json --prompt "The meaning of life is"
 ```
 
-Right now it seems to use around ~25 gigabytes of memory for 7B and around ~50
-gigabytes for 13B. If you don't use OpenCL, then internally all parameters are
-cast to 32-bit floats.
+By default, the weights are used in the precision they have in the source
+files. You can pass the `--f16` command line argument to cast the largest
+weight matrices to float16. Using OpenCL also casts the weight matrices to
+float16.
 
 You can use `--temperature`, `--top-p` and `--top-k` to adjust token sampler settings.
 
diff --git a/src/rllama_main.rs b/src/rllama_main.rs
index cf30ddf..b8813e1 100644
--- a/src/rllama_main.rs
+++ b/src/rllama_main.rs
@@ -39,6 +39,9 @@ struct Cli {
     #[arg(long)]
     repetition_penalty: Option,
 
+    #[arg(long, action)]
+    f16: bool,
+
     #[cfg(feature = "opencl")]
     #[arg(long)]
     opencl_device: Option,
@@ -154,7 +157,7 @@ pub fn main() -> Result<(), Box> {
 
     let max_seq_len = cli.max_seq_len.unwrap_or(1024);
 
-    let data_settings = {
+    let mut data_settings = {
         #[cfg(feature = "opencl")]
         {
             if let Some(opencl) = opencl {
@@ -168,6 +171,10 @@ pub fn main() -> Result<(), Box> {
         DataSettings::new()
     };
 
+    if cli.f16 {
+        data_settings = data_settings.force_f16();
+    }
+
     pln!("Loading transformer weights from {}...", model_path);
     let tr = Transformer::from_unpickled(
         &unpickle_results,
diff --git a/src/tensor.rs b/src/tensor.rs
index de64a70..7dc4371 100644
--- a/src/tensor.rs
+++ b/src/tensor.rs
@@ -205,6 +205,11 @@ impl Tensor {
         }
     }
 
+    #[inline]
+    pub fn dtype(&self) -> TensorDType {
+        self.dtype
+    }
+
     pub fn from_unpickled, S: AsRef>(
         unpickled: &unpickler::Value,
         name: S,
@@ -1014,11 +1019,44 @@ impl Tensor {
         false
     }
 
-    #[cfg(feature = "opencl")]
+    #[cfg(not(feature = "opencl"))]
+    pub fn is_on_gpu(&self) -> bool {
+        false
+    }
+
     pub fn is_on_cpu(&self) -> bool {
         return !self.is_on_gpu();
    }
 
+    // Casts this tensor to match the other tensor's data type.
+    pub fn to_same_type(&self, other: &Tensor) -> Tensor {
+        let result = self.clone();
+        if result.dtype() == other.dtype() {
+            return result;
+        }
+        match other.dtype {
+            TensorDType::Float32 => self.to_f32(),
+            TensorDType::Float16 => self.to_f16(),
+        }
+    }
+
+    pub fn into_same_type(self, other: &Tensor) -> Tensor {
+        if self.dtype() == other.dtype() {
+            return self;
+        }
+        match other.dtype {
+            TensorDType::Float32 => self.to_f32(),
+            TensorDType::Float16 => self.to_f16(),
+        }
+    }
+
+    pub fn into_dtype(self, dtype: TensorDType) -> Tensor {
+        match dtype {
+            TensorDType::Float32 => self.to_f32(),
+            TensorDType::Float16 => self.to_f16(),
+        }
+    }
+
     #[cfg(feature = "opencl")]
     fn matrix_mul_inplace_transposed_gpu(&mut self, src: &Tensor, other: &Tensor) {
         let mut self_od = self.opencl_data.write().unwrap();
diff --git a/src/transformer.rs b/src/transformer.rs
index 3af2a0f..07205d6 100644
--- a/src/transformer.rs
+++ b/src/transformer.rs
@@ -39,6 +39,8 @@ pub struct DataSettings {
     use_opencl_for_attention: bool,
     #[cfg(feature = "opencl")]
     cl: Option,
+
+    force_f16: bool,
 }
 
 // OpenCL is safe to send to threads but Rust doesn't know that
@@ -51,13 +53,14 @@ impl DataSettings {
         DataSettings {
             use_opencl_for_feedforward: false,
             use_opencl_for_attention: false,
+            force_f16: false,
             cl: cl.clone(),
         }
     }
 
     #[cfg(not(feature = "opencl"))]
     pub fn new() -> Self {
-        DataSettings {}
+        DataSettings { force_f16: false }
     }
 
     #[cfg(feature = "opencl")]
@@ -69,6 +72,11 @@ impl DataSettings {
         self.use_opencl_for_attention = true;
         self
     }
+
+    pub fn force_f16(mut self) -> DataSettings {
+        self.force_f16 = true;
+        self
+    }
 }
 
 pub struct TransformerCaches {
@@ -400,6 +408,12 @@ impl FeedForward {
             FromPiecesDirection::Rows,
         )?;
 
+        if data_settings.force_f16 {
+            w1 = w1.to_f16();
+            w2 = w2.to_f16();
+            w3 = w3.to_f16();
+        }
+
         #[cfg(feature = "opencl")]
         {
             if data_settings.use_opencl_for_feedforward {
@@ -412,12 +426,7 @@
                 w3.to_gpu_inplace(&ds.cl.unwrap()).unwrap();
             }
         }
-        #[cfg(not(feature = "opencl"))]
-        {
-            w1 = w1.to_f32();
-            w2 = w2.to_f32();
-            w3 = w3.to_f32();
-        }
+        // w1, w2, w3 may be f32 or f16 depending on the source data.
 
         Ok(Self {
             w1,
@@ -428,31 +437,40 @@
     }
 
     pub fn forward(&self, x: &mut Tensor) -> Tensor {
+        let original_x_dtype = x.dtype();
+        if x.dtype() != self.w1.dtype() {
+            *x = x.to_same_type(&self.w1);
+        }
         #[cfg(feature = "opencl")]
         let x_was_on_cpu: bool;
         #[cfg(feature = "opencl")]
         {
             x_was_on_cpu = x.is_on_cpu();
             if self.data_settings.use_opencl_for_feedforward {
-                *x = x.to_f16();
                 x.to_gpu_inplace(self.data_settings.cl.as_ref().unwrap())
                     .unwrap();
             }
         }
-        let (w1_out, w3_out) = rayon::join(
+        let (mut w1_out, mut w3_out) = rayon::join(
             || self.w1.matrix_mul_transposed(x),
             || self.w3.matrix_mul_transposed(x),
         );
-        let w1_out = w1_out.silu();
-        let w1w3_out = w1_out.hadamard_product(&w3_out).transpose();
+        // Float16 is not supported for some of these ops on the CPU.
+        if w1_out.is_on_cpu() && w1_out.dtype() == TensorDType::Float16 {
+            w1_out = w1_out.to_f32();
+            w3_out = w3_out.to_f32();
+        }
+        let w1_out = w1_out.silu();
+        let mut w1w3_out = w1_out.hadamard_product(&w3_out).transpose();
+        if w1w3_out.dtype() != self.w2.dtype() {
+            w1w3_out = w1w3_out.to_same_type(&self.w2);
+        }
         #[cfg(not(feature = "opencl"))]
-        if w1w3_out.rows() == 1 {
-            self
-                .w2
-                .matrix_vector_mul_transposed_multithreaded(&w1w3_out)
-        } else {
-            self.w2.matrix_mul_transposed(&w1w3_out)
+        {
+            self.w2
+                .matrix_mul_transposed(&w1w3_out)
+                .into_dtype(original_x_dtype)
         }
         #[cfg(feature = "opencl")]
         {
@@ -503,6 +521,13 @@ impl Attention {
             FromPiecesDirection::Cols,
         )?;
 
+        if data_settings.force_f16 {
+            wq = wq.to_f16();
+            wk = wk.to_f16();
+            wv = wv.to_f16();
+            wo = wo.to_f16();
+        }
+
         #[cfg(feature = "opencl")]
         {
             if data_settings.use_opencl_for_attention {
@@ -517,13 +542,6 @@
                 wo.to_gpu_inplace(&ds.cl.unwrap()).unwrap();
             }
         }
-        #[cfg(not(feature = "opencl"))]
-        {
-            wq = wq.to_f32();
-            wk = wk.to_f32();
-            wv = wv.to_f32();
-            wo = wo.to_f32();
-        }
 
         Ok(Self {
             wq,
@@ -544,13 +562,17 @@
         mask: &Option,
         attention_cache: &mut AttentionCache,
     ) -> Tensor {
+        let original_x_dtype = x.dtype();
+        if x.dtype() != self.wq.dtype() {
+            *x = x.to_same_type(&self.wq);
+        }
+
         #[cfg(feature = "opencl")]
         let x_was_on_cpu: bool;
         #[cfg(feature = "opencl")]
         {
             x_was_on_cpu = x.is_on_cpu();
             if self.data_settings.use_opencl_for_attention {
-                *x = x.to_f16();
                 x.to_gpu_inplace(self.data_settings.cl.as_ref().unwrap())
                     .unwrap();
             }
         }
@@ -570,11 +592,11 @@
 
         #[cfg(not(feature = "opencl"))]
         let (xq_out, (xk_out, xv_out)) = rayon::join(
-            || x.matrix_mul_transposed(&self.wq),
+            || x.matrix_mul_transposed(&self.wq).to_f32(),
             || {
                 rayon::join(
-                    || x.matrix_mul_transposed(&self.wk),
-                    || x.matrix_mul_transposed(&self.wv),
+                    || x.matrix_mul_transposed(&self.wk).to_f32(),
+                    || x.matrix_mul_transposed(&self.wv).to_f32(),
                 )
             },
         );
@@ -666,7 +688,9 @@
         #[cfg(not(feature = "opencl"))]
         {
             let xq_row = Tensor::concat(&concat_vec2).view(1, self.wo.rows());
-            xq_row.matrix_mul_transposed(&self.wo)
+            xq_row
+                .into_same_type(&self.wo)
+                .matrix_mul_transposed(&self.wo)
         }
         #[cfg(feature = "opencl")]
         {
@@ -689,7 +713,7 @@
         let output3: Vec<&Tensor> = output2.iter().collect();
         let output2: Tensor = Tensor::concat(&output3);
 
-        output2
+        output2.into_dtype(original_x_dtype)
     }
 }
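With this change, forcing half-precision weights only requires adding the new flag to the usual invocation. A usage sketch, reusing the placeholder paths from the README example above (the flag takes no value; without it, the weights keep the precision of the source files):

```
cargo run --release -- --tokenizer-model /path/to/tokenizer.model --model-path /path/to/LLaMA/7B --param-path /path/to/LLaMA/7B/params.json --prompt "The meaning of life is" --f16
```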