ComfyUI and DeepSeek’s Janus Pro 7B or 1B “cuda” Error Fix for Mac
Janus Pro 7B and 1B are new image generation models from DeepSeek. The default ComfyUI setup for them available on GitHub assumes a graphics card with CUDA cores. If you own a 2020 or newer Mac (Apple Silicon), this will keep giving you errors related to “cuda.” The default setup has no fallback, so as soon as it fails to find a supported graphics card, it stops.
How to fix this?
To fix this issue, you need to add a fallback: if “cuda” isn’t available, try “mps” (Apple’s Metal backend for the Mac’s built-in GPU), and if neither is available, fall back to the CPU.
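In short, the device selection in all three files below boils down to this check:

import torch

# Pick the best available device: CUDA -> MPS (Apple Silicon) -> CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
elif torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")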
- Go to your ComfyUI folder and then into custom_nodes.
- Inside the Janus node’s folder, you will find three files: model_loader.py, image_understanding.py, and image_generation.py (example paths are shown after this list).
- Open those files one by one in a code editor, e.g. Visual Studio Code.
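Depending on how you installed the Janus-Pro node, the folder name under custom_nodes may differ; as an example (the “ComfyUI-Janus-Pro” folder name is an assumption, yours may vary), the three files typically sit at paths like:

ComfyUI/custom_nodes/ComfyUI-Janus-Pro/nodes/model_loader.py
ComfyUI/custom_nodes/ComfyUI-Janus-Pro/nodes/image_understanding.py
ComfyUI/custom_nodes/ComfyUI-Janus-Pro/nodes/image_generation.py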
Replace the full contents of model_loader.py with the following code:
import os
import torch

class JanusModelLoader:
    def __init__(self):
        pass

    @classmethod
    def INPUT_TYPES(s):
        return {
            "required": {
                "model_name": (["deepseek-ai/Janus-Pro-1B", "deepseek-ai/Janus-Pro-7B"],),
            },
        }

    RETURN_TYPES = ("JANUS_MODEL", "JANUS_PROCESSOR")
    RETURN_NAMES = ("model", "processor")
    FUNCTION = "load_model"
    CATEGORY = "Janus-Pro"

    def load_model(self, model_name):
        """
        Load Janus model automatically on CUDA, MPS (Apple Silicon), or CPU,
        and adjust precision (bfloat16/float16/float32) as needed.
        """
        try:
            from janus.models import MultiModalityCausalLM, VLChatProcessor
            from transformers import AutoModelForCausalLM
        except ImportError:
            raise ImportError("Please install Janus using 'pip install -r requirements.txt'")

        # Determine device: CUDA -> MPS -> CPU
        if torch.cuda.is_available():
            device = torch.device("cuda")
        elif torch.backends.mps.is_available():
            device = torch.device("mps")
        else:
            device = torch.device("cpu")

        # Choose dtype
        # - If device is CUDA, try bfloat16, else fall back to float16.
        # - If device is MPS or CPU, default to float32 for safety.
        if device.type == "cuda":
            try:
                dtype = torch.bfloat16
                # Test by creating a small tensor
                _ = torch.zeros(1, dtype=dtype, device=device)
            except RuntimeError:
                dtype = torch.float16
        elif device.type == "mps":
            # As of PyTorch 2.x, MPS works best with float32
            dtype = torch.float32
        else:
            # CPU fallback
            dtype = torch.float32

        # Get ComfyUI root directory (assuming four levels up from __file__)
        comfy_path = os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(__file__))))

        # Build local model path
        model_dir = os.path.join(
            comfy_path,
            "models",
            "Janus-Pro",
            os.path.basename(model_name)
        )
        if not os.path.exists(model_dir):
            raise ValueError(
                f"Local model not found at {model_dir}. "
                "Please download the model and place it in "
                "the ComfyUI/models/Janus-Pro folder."
            )

        # Load processor
        vl_chat_processor = VLChatProcessor.from_pretrained(model_dir)

        # Load model
        vl_gpt = AutoModelForCausalLM.from_pretrained(
            model_dir,
            trust_remote_code=True
        )

        # Move model to device + correct dtype
        vl_gpt = vl_gpt.to(dtype=dtype, device=device).eval()

        return (vl_gpt, vl_chat_processor)
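The loader above looks for the weights in ComfyUI/models/Janus-Pro/Janus-Pro-1B or ComfyUI/models/Janus-Pro/Janus-Pro-7B. If you haven’t downloaded them yet, one way to do it (assuming you have the huggingface_hub package installed; the local_dir path is an example you should adjust to your own ComfyUI installation) is a small script like this:

from huggingface_hub import snapshot_download

# Download the Janus Pro weights into the folder the loader expects.
snapshot_download(
    repo_id="deepseek-ai/Janus-Pro-7B",
    local_dir="ComfyUI/models/Janus-Pro/Janus-Pro-7B",
)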
Next, replace the contents of image_understanding.py with the following:
import torch
import numpy as np
from PIL import Image

class JanusImageUnderstanding:
    @classmethod
    def INPUT_TYPES(s):
        return {
            "required": {
                "model": ("JANUS_MODEL",),
                "processor": ("JANUS_PROCESSOR",),
                "image": ("IMAGE",),
                "question": ("STRING", {
                    "multiline": True,
                    "default": "Describe this image in detail."
                }),
                "seed": ("INT", {
                    "default": 666666666666666,
                    "min": 0,
                    "max": 0xffffffffffffffff
                }),
                "temperature": ("FLOAT", {
                    "default": 0.1,
                    "min": 0.0,
                    "max": 1.0
                }),
                "top_p": ("FLOAT", {
                    "default": 0.95,
                    "min": 0.0,
                    "max": 1.0
                }),
                "max_new_tokens": ("INT", {
                    "default": 512,
                    "min": 1,
                    "max": 2048
                }),
            },
        }

    RETURN_TYPES = ("STRING",)
    RETURN_NAMES = ("text",)
    FUNCTION = "analyze_image"
    CATEGORY = "Janus-Pro"

    def analyze_image(self, model, processor, image, question, seed, temperature, top_p, max_new_tokens):
        """
        Perform image understanding using Janus on CPU, CUDA (if available), or MPS (for Apple Silicon).
        """
        try:
            from janus.models import MultiModalityCausalLM
        except ImportError:
            raise ImportError("Please install Janus using 'pip install -r requirements.txt'")

        # Determine the device: CUDA -> MPS -> CPU
        if torch.cuda.is_available():
            device = torch.device("cuda")
        elif torch.backends.mps.is_available():
            device = torch.device("mps")
        else:
            device = torch.device("cpu")

        # Set random seed
        torch.manual_seed(seed)
        if device.type == "cuda":
            torch.cuda.manual_seed(seed)
        # For MPS/CPU, torch.manual_seed above is sufficient

        # Move model to the chosen device (if needed).
        # If your model is already on the correct device, you can remove this line:
        model.to(device)

        # Handle the input image from ComfyUI (BHWC). We want a single RGB image [H, W, C].
        if len(image.shape) == 4 and image.shape[0] == 1:  # [1, H, W, C] -> [H, W, C]
            image = image.squeeze(0)

        # Ensure [0,1] range, then convert to [0,255] uint8
        image = (torch.clamp(image, 0, 1) * 255).cpu().numpy().astype(np.uint8)

        # Convert to PIL
        pil_image = Image.fromarray(image, mode='RGB')

        conversation = [
            {
                "role": "<|User|>",
                "content": f"<image_placeholder>\n{question}",
                "images": [pil_image],
            },
            {"role": "<|Assistant|>", "content": ""},
        ]

        # Prepare tokenized inputs
        prepare_inputs = processor(
            conversations=conversation,
            images=[pil_image],
            force_batchify=True
        )

        # Move prepared inputs to the same device as the model
        prepare_inputs = prepare_inputs.to(device)

        # Prepare input embeddings
        inputs_embeds = model.prepare_inputs_embeds(**prepare_inputs)

        # Generate answer
        outputs = model.language_model.generate(
            inputs_embeds=inputs_embeds,
            attention_mask=prepare_inputs.attention_mask,
            pad_token_id=processor.tokenizer.eos_token_id,
            bos_token_id=processor.tokenizer.bos_token_id,
            eos_token_id=processor.tokenizer.eos_token_id,
            max_new_tokens=max_new_tokens,
            do_sample=True,
            temperature=temperature,
            top_p=top_p,
            use_cache=True,
        )

        # Decode the answer
        answer = processor.tokenizer.decode(outputs[0].cpu().tolist(), skip_special_tokens=True)

        return (answer,)

    @classmethod
    def IS_CHANGED(cls, seed, **kwargs):
        return seed
Finally, replace the contents of image_generation.py with this code:
import torch
import numpy as np
from PIL import Image

class JanusImageGeneration:
    @classmethod
    def INPUT_TYPES(s):
        return {
            "required": {
                "model": ("JANUS_MODEL",),
                "processor": ("JANUS_PROCESSOR",),
                "prompt": ("STRING", {
                    "multiline": True,
                    "default": "A beautiful photo of"
                }),
                "seed": ("INT", {
                    "default": 666666666666666,
                    "min": 0,
                    "max": 0xffffffffffffffff
                }),
                "batch_size": ("INT", {
                    "default": 1,
                    "min": 1,
                    "max": 16
                }),
                "cfg_weight": ("FLOAT", {
                    "default": 5.0,
                    "min": 1.0,
                    "max": 10.0,
                    "step": 0.5
                }),
                "temperature": ("FLOAT", {
                    "default": 1.0,
                    "min": 0.1,
                    "max": 2.0,
                    "step": 0.1
                }),
                "top_p": ("FLOAT", {
                    "default": 0.95,
                    "min": 0.0,
                    "max": 1.0
                }),
            },
        }

    RETURN_TYPES = ("IMAGE",)
    RETURN_NAMES = ("images",)
    FUNCTION = "generate_images"
    CATEGORY = "Janus-Pro"

    def generate_images(self, model, processor, prompt, seed, batch_size=1, temperature=1.0, cfg_weight=5.0, top_p=0.95):
        """
        Generate images using Janus on CPU, CUDA (if available), or MPS (for Apple Silicon).
        """
        try:
            from janus.models import MultiModalityCausalLM
        except ImportError:
            raise ImportError("Please install Janus using 'pip install -r requirements.txt'")

        # Determine the device: CUDA -> MPS -> CPU
        if torch.cuda.is_available():
            device = torch.device("cuda")
        elif torch.backends.mps.is_available():
            device = torch.device("mps")
        else:
            device = torch.device("cpu")

        # Set random seed
        torch.manual_seed(seed)
        if device.type == "cuda":
            torch.cuda.manual_seed(seed)
        # For MPS/CPU, torch.manual_seed above is sufficient

        # Move model to device (if needed).
        # If your model is already on the correct device externally, you may remove these lines:
        model.language_model.to(device)
        model.gen_vision_model.to(device)

        # Image parameter settings
        image_token_num = 576   # 24x24 patches
        img_size = 384          # Output image size
        patch_size = 16         # Each patch size
        parallel_size = batch_size

        # Prepare conversation format
        conversation = [
            {
                "role": "<|User|>",
                "content": prompt,
            },
            {
                "role": "<|Assistant|>",
                "content": "",
            },
        ]

        # Prepare prompt input
        sft_format = processor.apply_sft_template_for_multi_turn_prompts(
            conversations=conversation,
            sft_format=processor.sft_format,
            system_prompt="",
        )
        prompt = sft_format + processor.image_start_tag

        # Encode text input
        input_ids = processor.tokenizer.encode(prompt)
        input_ids = torch.LongTensor(input_ids)

        # Create conditional/unconditional tokens on the chosen device
        tokens = torch.zeros((parallel_size * 2, len(input_ids)), dtype=torch.int, device=device)
        for i in range(parallel_size * 2):
            tokens[i, :] = input_ids
            if i % 2 != 0:  # unconditional input
                tokens[i, 1:-1] = processor.pad_id

        # Get text embeddings
        inputs_embeds = model.language_model.get_input_embeddings()(tokens)

        # Prepare a placeholder for generated image tokens
        generated_tokens = torch.zeros((parallel_size, image_token_num), dtype=torch.int, device=device)
        outputs = None

        # Autoregressive generation
        for i in range(image_token_num):
            outputs = model.language_model.model(
                inputs_embeds=inputs_embeds,
                use_cache=True,
                past_key_values=outputs.past_key_values if i != 0 else None
            )
            hidden_states = outputs.last_hidden_state

            # Get logits and apply CFG
            logits = model.gen_head(hidden_states[:, -1, :])
            logit_cond = logits[0::2, :]
            logit_uncond = logits[1::2, :]
            logits = logit_uncond + cfg_weight * (logit_cond - logit_uncond)
            probs = torch.softmax(logits / temperature, dim=-1)

            # Sample next token
            next_token = torch.multinomial(probs, num_samples=1)
            generated_tokens[:, i] = next_token.squeeze(dim=-1)

            # Prepare next step input.
            # We expand next_token so that cond/uncond alignment remains correct.
            next_token = torch.cat([next_token.unsqueeze(dim=1), next_token.unsqueeze(dim=1)], dim=1).view(-1)
            img_embeds = model.prepare_gen_img_embeds(next_token).to(device)
            inputs_embeds = img_embeds.unsqueeze(dim=1)

        # Decode generated tokens into images
        dec = model.gen_vision_model.decode_code(
            generated_tokens.to(dtype=torch.int),
            shape=[parallel_size, 8, img_size // patch_size, img_size // patch_size]
        )

        # Convert to float32 on CPU for post-processing
        dec = dec.to(torch.float32).cpu().numpy()

        # Ensure BCHW with 3 channels
        if dec.shape[1] != 3:
            dec = np.repeat(dec, 3, axis=1)

        # Map from [-1,1] to [0,1]
        dec = (dec + 1) / 2

        # Clip to [0,1]
        dec = np.clip(dec, 0, 1)

        # Reorder for ComfyUI [B,C,H,W] -> [B,H,W,C]
        dec = np.transpose(dec, (0, 2, 3, 1))

        # Convert back to torch tensor
        images = torch.from_numpy(dec).float()

        # Final shape check
        assert images.ndim == 4 and images.shape[-1] == 3, f"Unexpected shape: {images.shape}"

        return (images,)

    @classmethod
    def IS_CHANGED(cls, seed, **kwargs):
        return seed
After replacing the three files, restart ComfyUI so the updated nodes are loaded. You can also download this image and drag it into ComfyUI to load the workflow.
Is it worth it?
No, I don’t recommend it. It took me about 15 minutes to generate an image this small, with RAM usage climbing the whole time. LOL! Why go through so much hassle when you have a much better option?
Use Flux on Mac with the Built-in Graphics
Flux.1 dev and Flux Schnell set up in ComfyUI work like a charm using just the 4-step method (quick and efficient). It takes 3 to 5 minutes on my Mac with an M1 Max and 64GB RAM to generate a 1024 x 1024 image, and the image quality is much better. I don’t think any other model beats Flux except Flux 1.1 Pro and Flux 1.1 Pro Ultra themselves. Surprisingly, this is also the only model I’ve found that writes text on images accurately (almost 80% of the time).