How to fix this issue for C# Winforms project #242

Open

zydjohnHotmail opened this issue Jan 5, 2025 · 2 comments

Comments

@zydjohnHotmail
Hi, I always get the same runtime error:

Image tensor dimensions: 1, 1, 3, 480, 854
Image size tensor dimensions: 1, 2
Exception thrown: 'Microsoft.ML.OnnxRuntime.OnnxRuntimeException' in Microsoft.ML.OnnxRuntime.dll
[ErrorCode:RuntimeException] Non-zero status code returned while running Add node. Name:'/img_processor/vision_model/embeddings/Add' Status Message: D:\a_work\1\s\onnxruntime\core/providers/cpu/math/element_wise_ops.h:560 onnxruntime::BroadcastIterator::Append axis == 1 || axis == largest was false. Attempting to broadcast an axis by a dimension other than 1. 577 by 2075

=> I wrote a WinForms app project in which I want to use the Phi-3 model to describe images, but my image size is different: 854 by 480 pixels.
Here is my Form1.cs code:

using System;
using System.Collections.Generic;
using System.Diagnostics;
using System.Drawing;
using System.Drawing.Drawing2D;
using System.IO;
using System.Linq;
using System.Text;
using System.Windows.Forms;
using Microsoft.ML.OnnxRuntime;
using Microsoft.ML.OnnxRuntime.Tensors;
using Newtonsoft.Json.Linq;

namespace TestPhi3Form
{
public partial class Form1 : Form
{
    private string visionModelPath =
        @"D:\Phi3\Phi3-vision-128k-instruct-onnx-cpu\Phi3-vision-128k-instruct-onnx-cpu\phi-3-v-128k-instruct-vision.onnx";
    private string textModelPath =
        @"D:\Phi3\Phi3-vision-128k-instruct-onnx-cpu\Phi3-vision-128k-instruct-onnx-cpu\phi-3-v-128k-instruct-text.onnx";
    private string tokenPath =
        @"D:\Phi3\Phi3-vision-128k-instruct-onnx-cpu\Phi3-vision-128k-instruct-onnx-cpu\tokenizer.json";

    private Dictionary<int, string> tokenIdToTextMap;

    private InferenceSession visionSession;
    private InferenceSession textSession;

    public Form1()
    {
        InitializeComponent();
        visionSession = new InferenceSession(visionModelPath);
        textSession = new InferenceSession(textModelPath);
        LoadTokenizerMapping(tokenPath);
    }

    private void LoadTokenizerMapping(string tokenizerJsonPath)
    {
        // Read the tokenizer.json file
        var json = File.ReadAllText(tokenizerJsonPath);
        var tokenizerConfig = JObject.Parse(json);

        // Extract the "added_tokens" section
        var addedTokens = tokenizerConfig["added_tokens"];

        // Create a dictionary to map token IDs to their text
        tokenIdToTextMap = new Dictionary<int, string>();
        foreach (var token in addedTokens)
        {
            int id = token["id"].ToObject<int>();
            string content = token["content"].ToString();
            tokenIdToTextMap[id] = content;
        }
    }

    private Tensor<float> PreprocessImage(Image image)
    {
        // Resize the image to the required input size (e.g., 224x224)
        const int targetWidth = 224;
        const int targetHeight = 224;
        Bitmap resizedImage = new Bitmap(image, new Size(targetWidth, targetHeight));

        // Convert the image to a tensor with an additional dimension
        var tensor = new DenseTensor<float>(new[] { 1, 1, 3, targetHeight, targetWidth });

        for (int y = 0; y < targetHeight; y++)
        {
            for (int x = 0; x < targetWidth; x++)
            {
                Color pixel = resizedImage.GetPixel(x, y);

                // Normalize RGB values to [0, 1] range
                tensor[0, 0, 0, y, x] = pixel.R / 255f;
                tensor[0, 0, 1, y, x] = pixel.G / 255f;
                tensor[0, 0, 2, y, x] = pixel.B / 255f;
            }
        }
        return tensor;
    }

    private Tensor<float> PreprocessImage(string imagePath)
    {
        // Load the image
        using var bitmap = new Bitmap(imagePath);

        // Ensure the image is of size 854x480
        const int targetWidth = 854;
        const int targetHeight = 480;

        // Convert to a Tensor<float> in NCHW format (batch, channels, height, width)
        var tensor = new DenseTensor<float>(new[] { 1, 3, targetHeight, targetWidth }); // Batch, Channels, Height, Width
        for (int y = 0; y < targetHeight; y++)
        {
            for (int x = 0; x < targetWidth; x++)
            {
                var pixel = bitmap.GetPixel(x, y);

                // Normalize pixel values to [0, 1] and populate the tensor
                tensor[0, 0, y, x] = pixel.R / 255f; // Red channel
                tensor[0, 1, y, x] = pixel.G / 255f; // Green channel
                tensor[0, 2, y, x] = pixel.B / 255f; // Blue channel
            }
        }

        // Add an additional dimension to match the expected rank of 5
        var expandedTensor = tensor.Reshape(new[] { 1, 1, 3, targetHeight, targetWidth });

        return expandedTensor;
    }

    private List<NamedOnnxValue> PrepareVisionInputs(string imagePath)
    {
        // Preprocess the image
        var imageTensor = PreprocessImage(imagePath);

        // Prepare the image size tensor (Height and Width)
        var imageSizeTensor = new DenseTensor<long>(new[] { 1, 2 });
        imageSizeTensor[0, 0] = 480; // Height
        imageSizeTensor[0, 1] = 854; // Width

        // Debugging: Print the tensor dimensions and types
        Debug.Print($"Image tensor dimensions: {string.Join(", ", imageTensor.Dimensions.ToArray())}");
        Debug.Print($"Image size tensor dimensions: {string.Join(", ", imageSizeTensor.Dimensions.ToArray())}");

        // Create inputs for the vision model
        var inputs = new List<NamedOnnxValue>
        {
        NamedOnnxValue.CreateFromTensor("pixel_values", imageTensor),
        NamedOnnxValue.CreateFromTensor("image_sizes", imageSizeTensor)
        };

        return inputs;
    }

    private List<NamedOnnxValue> PrepareTextInputs(Tensor<float> visualFeatures)
    {
        // Create a dummy attention mask (adjust based on your model's requirements)
        var attentionMask = new DenseTensor<float>(new[] { 1, visualFeatures.Dimensions[1] });

        // Create inputs for the text model
        var inputs = new List<NamedOnnxValue>
        {
           NamedOnnxValue.CreateFromTensor("inputs_embeds", visualFeatures),
           NamedOnnxValue.CreateFromTensor("attention_mask", attentionMask)
        };

        return inputs;
    }


    private string GenerateResponse(Tensor<float> visionOutput)
    {
        // Prepare inputs for the text model
        var inputs = new List<NamedOnnxValue>
        {
            NamedOnnxValue.CreateFromTensor("input", visionOutput)
        };

        // Run the text model
        using (var results = textSession.Run(inputs))
        {
            var output = results.First().AsTensor<float>();
            return ProcessTextOutput(output);
        }
    }

    private string ProcessTextOutput(Tensor<float> output)
    {
        // Convert the float array to a byte array
        var floatArray = output.ToArray();
        var byteArray = new byte[floatArray.Length * sizeof(float)];
        Buffer.BlockCopy(floatArray, 0, byteArray, 0, byteArray.Length);

        // Convert the byte array to a string
        var text = Encoding.UTF8.GetString(byteArray);
        return text;
    }


    private string DecodeTokenIds(int[] tokenIds)
    {
        // Decode the token IDs into text using the tokenizer mapping
        var text = string.Join("", tokenIds.Select(id => tokenIdToTextMap.ContainsKey(id) ? tokenIdToTextMap[id] : $"<unk:{id}>"));
        return text;
    }

    private void DetectObjects(string imagePath)
    {
        try
        {
            // Prepare inputs for the vision model (PrepareVisionInputs
            // preprocesses the image internally, so there is no need to
            // call PreprocessImage separately here)
            var visionInputs = PrepareVisionInputs(imagePath);

            // Run the vision model
            var visualFeatures = RunVisionModel(visionInputs);

            // Prepare inputs for the text model
            var textInputs = PrepareTextInputs(visualFeatures);

            // Run the text model
            var response = RunTextModel(textInputs);

            // Display the response
            MessageBox.Show($"Model Response: {response}");
        }
        catch (Exception ex)
        {
            Debug.Print(ex.Message);
            MessageBox.Show($"Error during inference: {ex.Message}");
        }
    }


    private string RunTextModel(List<NamedOnnxValue> inputs)
    {
        using (var results = textSession.Run(inputs))
        {
            var logits = results.First().AsTensor<float>();

            // Decode the logits into text (adjust based on your tokenizer)
            var tokenIds = logits.ToArray().Select(x => (int)x).ToArray();
            var text = DecodeTokenIds(tokenIds);
            return text;
        }
    }


    private Tensor<float> RunVisionModel(List<NamedOnnxValue> inputs)
    {
        using (var results = visionSession.Run(inputs))
        {
            // Copy the output before 'results' is disposed; the tensor from
            // AsTensor<float>() can be backed by native memory that Dispose frees
            var visualFeatures = results.First().AsTensor<float>().ToDenseTensor();
            return visualFeatures;
        }
    }


    private Tensor<float> ConvertImageToTensor(Image image)
    {
        // Resize the image to the model's input size (e.g., 224x224)
        const int targetWidth = 224;
        const int targetHeight = 224;

        Bitmap resizedImage = new Bitmap(image, new Size(targetWidth, targetHeight));
        var tensor = new DenseTensor<float>(new[] { 1, 3, targetHeight, targetWidth });

        for (int y = 0; y < targetHeight; y++)
        {
            for (int x = 0; x < targetWidth; x++)
            {
                Color pixel = resizedImage.GetPixel(x, y);

                // Normalize RGB values to [0, 1] range
                tensor[0, 0, y, x] = pixel.R / 255f;
                tensor[0, 1, y, x] = pixel.G / 255f;
                tensor[0, 2, y, x] = pixel.B / 255f;
            }
        }

        return tensor;
    }

    private (List<Rectangle>, List<string>) ProcessOutput(float[] output)
    {
        var boxes = new List<Rectangle>();
        var descriptions = new List<string>();

        // Example logic for processing output (adjust based on your model)
        int numObjects = output.Length / 6; // Assuming 6 values per object: x, y, width, height, classId, confidence
        for (int i = 0; i < numObjects; i++)
        {
            int offset = i * 6;
            float x = output[offset] * PBImage.Width;
            float y = output[offset + 1] * PBImage.Height;
            float width = output[offset + 2] * PBImage.Width;
            float height = output[offset + 3] * PBImage.Height;
            int classId = (int)output[offset + 4];
            float confidence = output[offset + 5];

            if (confidence > 0.5) // Confidence threshold
            {
                boxes.Add(new Rectangle((int)x, (int)y, (int)width, (int)height));
                descriptions.Add($"Object {classId} with confidence {confidence:P}");
            }
        }

        return (boxes, descriptions);
    }

    private void DrawBoundingBoxes(List<Rectangle> boxes, List<string> descriptions)
    {
        if (PBImage.Image == null)
            return;

        Image image = (Image)PBImage.Image.Clone();
        using (Graphics g = Graphics.FromImage(image))
        {
            Pen pen = new Pen(Color.Red, 2);
            Font font = new Font("Arial", 10);
            Brush brush = new SolidBrush(Color.Yellow);

            for (int i = 0; i < boxes.Count; i++)
            {
                g.DrawRectangle(pen, boxes[i]);
                g.DrawString(descriptions[i], font, brush, boxes[i].Location);
            }
        }

        PBImage.Image = image;
    }


    private void BTNInfer_Click(object sender, EventArgs e)
    {
        using (OpenFileDialog openFileDialog = new OpenFileDialog())
        {
            openFileDialog.Filter = "Image Files|*.png";

            if (openFileDialog.ShowDialog() == DialogResult.OK)
            {
                string filePath = openFileDialog.FileName;
                PBImage.Image = Image.FromFile(filePath);

                DetectObjects(filePath);
            }
        }
    }

    private void BTNFinish_Click(object sender, EventArgs e)
    {
        Console.Beep();
        Environment.Exit(0);
    }
}

}
=> I want to know how to run my program so that I can click the BTNInfer button, open any image file on my PC, and have Phi-3 describe what it sees in the image.
Please advise.
Thanks,

@leestott
Contributor

leestott commented Jan 8, 2025

Here are some steps and tips to help you resolve this issue:

1. Tensor Dimensions

The error message indicates a tensor dimension mismatch: the vision model expects a rank-4 tensor of shape [1, 3, 480, 854], but it is receiving the rank-5 tensor [1, 1, 3, 480, 854]. You need to ensure that the tensor you build matches the model's expected shape exactly.
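
You can confirm what the model actually declares by reading the session's input metadata (a minimal sketch using the ONNX Runtime C# API; dynamic axes are reported as -1, with their names in SymbolicDimensions):

foreach (var input in visionSession.InputMetadata)
{
    var meta = input.Value;

    // Print each input's name, element type, and declared shape
    Debug.Print($"Input '{input.Key}': type={meta.ElementType}, " +
                $"dims=[{string.Join(", ", meta.Dimensions)}]");
}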

2. Preprocess Image Function

Update your PreprocessImage function to ensure it returns a tensor with the correct dimensions:

private Tensor<float> PreprocessImage(string imagePath)
{
    // Load the image
    using var bitmap = new Bitmap(imagePath);

    // Ensure the image is of size 854x480
    const int targetWidth = 854;
    const int targetHeight = 480;

    // Convert to a Tensor<float> in NCHW format (batch, channels, height, width)
    var tensor = new DenseTensor<float>(new[] { 1, 3, targetHeight, targetWidth }); // Batch, Channels, Height, Width
    for (int y = 0; y < targetHeight; y++)
    {
        for (int x = 0; x < targetWidth; x++)
        {
            var pixel = bitmap.GetPixel(x, y);

            // Normalize pixel values to [0, 1] and populate the tensor
            tensor[0, 0, y, x] = pixel.R / 255f; // Red channel
            tensor[0, 1, y, x] = pixel.G / 255f; // Green channel
            tensor[0, 2, y, x] = pixel.B / 255f; // Blue channel
        }
    }

    return tensor;
}
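
As a side note, Bitmap.GetPixel is slow (one interop call per pixel, roughly 400,000 calls for an 854x480 image). If preprocessing becomes a bottleneck, a LockBits-based loop reads whole rows at once. A minimal sketch, assuming the bitmap is already at the target size and using the same NCHW tensor layout as above (FillTensorFromBitmap is a hypothetical helper name):

private static void FillTensorFromBitmap(Bitmap bitmap, DenseTensor<float> tensor)
{
    var rect = new Rectangle(0, 0, bitmap.Width, bitmap.Height);

    // Lock as 24bpp RGB; GDI+ converts other pixel formats on the fly
    var data = bitmap.LockBits(rect, System.Drawing.Imaging.ImageLockMode.ReadOnly,
                               System.Drawing.Imaging.PixelFormat.Format24bppRgb);
    try
    {
        var row = new byte[data.Stride];
        for (int y = 0; y < bitmap.Height; y++)
        {
            System.Runtime.InteropServices.Marshal.Copy(
                IntPtr.Add(data.Scan0, y * data.Stride), row, 0, data.Stride);
            for (int x = 0; x < bitmap.Width; x++)
            {
                // 24bpp rows are stored B, G, R
                tensor[0, 0, y, x] = row[x * 3 + 2] / 255f; // R
                tensor[0, 1, y, x] = row[x * 3 + 1] / 255f; // G
                tensor[0, 2, y, x] = row[x * 3] / 255f;     // B
            }
        }
    }
    finally
    {
        bitmap.UnlockBits(data);
    }
}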

3. Prepare Vision Inputs

Ensure that the PrepareVisionInputs function does not add an extra dimension:

private List<NamedOnnxValue> PrepareVisionInputs(string imagePath)
{
    // Preprocess the image
    var imageTensor = PreprocessImage(imagePath);

    // Prepare the image size tensor (Height and Width)
    var imageSizeTensor = new DenseTensor<long>(new[] { 1, 2 });
    imageSizeTensor[0, 0] = 480; // Height
    imageSizeTensor[0, 1] = 854; // Width

    // Debugging: Print the tensor dimensions and types
    Debug.Print($"Image tensor dimensions: {string.Join(", ", imageTensor.Dimensions.ToArray())}");
    Debug.Print($"Image size tensor dimensions: {string.Join(", ", imageSizeTensor.Dimensions.ToArray())}");

    // Create inputs for the vision model
    var inputs = new List<NamedOnnxValue>
    {
        NamedOnnxValue.CreateFromTensor("pixel_values", imageTensor),
        NamedOnnxValue.CreateFromTensor("image_sizes", imageSizeTensor)
    };

    return inputs;
}
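
A small hardening step: rather than hard-coding 480 and 854, you can read the sizes from the bitmap itself so pixel_values and image_sizes can never disagree (a sketch):

// Derive image_sizes from the actual bitmap instead of constants
using (var bmp = new Bitmap(imagePath))
{
    imageSizeTensor[0, 0] = bmp.Height;
    imageSizeTensor[0, 1] = bmp.Width;
}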

4. Debugging

Add debugging statements to ensure the dimensions of your tensors are correct before passing them to the model:

Debug.Print($"Image tensor dimensions: {string.Join(", ", imageTensor.Dimensions.ToArray())}");
Debug.Print($"Image size tensor dimensions: {string.Join(", ", imageSizeTensor.Dimensions.ToArray())}");

5. Running the Application

Ensure that your BTNInfer_Click method is correctly set up to handle the image file selection and model inference:

private void BTNInfer_Click(object sender, EventArgs e)
{
    using (OpenFileDialog openFileDialog = new OpenFileDialog())
    {
        openFileDialog.Filter = "Image Files|*.png;*.jpg;*.jpeg";

        if (openFileDialog.ShowDialog() == DialogResult.OK)
        {
            string filePath = openFileDialog.FileName;
            PBImage.Image = Image.FromFile(filePath);

            DetectObjects(filePath);
        }
    }
}
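
One caveat in this handler: Image.FromFile keeps the file handle open for as long as the Image lives, which can block deleting or re-opening the file later. A sketch of a copy-based load that releases the handle immediately:

using (var temp = new Bitmap(filePath))
{
    PBImage.Image?.Dispose();          // release any previously shown image
    PBImage.Image = new Bitmap(temp);  // deep copy; the file handle closes with 'temp'
}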

6. Exception Handling

Ensure you have proper exception handling to catch and debug any runtime errors:

private void DetectObjects(string imagePath)
{
    try
    {
        // Prepare inputs for the vision model (PrepareVisionInputs
        // preprocesses the image internally)
        var visionInputs = PrepareVisionInputs(imagePath);

        // Run the vision model
        var visualFeatures = RunVisionModel(visionInputs);

        // Prepare inputs for the text model
        var textInputs = PrepareTextInputs(visualFeatures);

        // Run the text model
        var response = RunTextModel(textInputs);

        // Display the response
        MessageBox.Show($"Model Response: {response}");
    }
    catch (Exception ex)
    {
        Debug.Print(ex.Message);
        MessageBox.Show($"Error during inference: {ex.Message}");
    }
}
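
If you want ONNX Runtime failures separated from everything else, the catch clause can be split. OnnxRuntimeException is the type shown in your error output, and ToString() keeps the failing node name and stack trace that Message alone can lose. A drop-in sketch for the catch block above:

catch (OnnxRuntimeException ortEx)
{
    // Graph-execution failures: ToString() includes the node name
    Debug.Print(ortEx.ToString());
    MessageBox.Show($"ONNX Runtime error: {ortEx.Message}");
}
catch (Exception ex)
{
    Debug.Print(ex.ToString());
    MessageBox.Show($"Error during inference: {ex.Message}");
}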

By following these steps, you should be able to resolve the tensor dimension mismatch and successfully run your application to describe images using the Phi-3 model.

@zydjohnHotmail
Author

Hi, thank you very much for your help and the code. I have changed my code; it now looks like this:

using System;
using System.Collections.Generic;
using System.Diagnostics;
using System.Drawing;
using System.Drawing.Drawing2D;
using System.IO;
using System.Linq;
using System.Text;
using System.Windows.Forms;
using Microsoft.ML.OnnxRuntime;
using Microsoft.ML.OnnxRuntime.Tensors;
using Newtonsoft.Json.Linq;

namespace TestPhi3Form
{
public partial class Form1 : Form
{
    private string visionModelPath =
        @"D:\Phi3\Phi3-vision-128k-instruct-onnx-cpu\Phi3-vision-128k-instruct-onnx-cpu\phi-3-v-128k-instruct-vision.onnx";
    private string textModelPath =
        @"D:\Phi3\Phi3-vision-128k-instruct-onnx-cpu\Phi3-vision-128k-instruct-onnx-cpu\phi-3-v-128k-instruct-text.onnx";
    private string tokenPath =
        @"D:\Phi3\Phi3-vision-128k-instruct-onnx-cpu\Phi3-vision-128k-instruct-onnx-cpu\tokenizer.json";

    private Dictionary<int, string> tokenIdToTextMap;

    private InferenceSession visionSession;
    private InferenceSession textSession;

    public Form1()
    {
        InitializeComponent();
        visionSession = new InferenceSession(visionModelPath);
        textSession = new InferenceSession(textModelPath);
        LoadTokenizerMapping(tokenPath);
    }

    private void LoadTokenizerMapping(string tokenizerJsonPath)
    {
        // Read the tokenizer.json file
        var json = File.ReadAllText(tokenizerJsonPath);
        var tokenizerConfig = JObject.Parse(json);

        // Extract the "added_tokens" section
        var addedTokens = tokenizerConfig["added_tokens"];

        // Create a dictionary to map token IDs to their text
        tokenIdToTextMap = new Dictionary<int, string>();
        foreach (var token in addedTokens)
        {
            int id = token["id"].ToObject<int>();
            string content = token["content"].ToString();
            tokenIdToTextMap[id] = content;
        }
    }

    private Tensor<float> PreprocessImage(Image image)
    {
        // Resize the image to the required input size (e.g., 224x224)
        const int targetWidth = 224;
        const int targetHeight = 224;
        Bitmap resizedImage = new Bitmap(image, new Size(targetWidth, targetHeight));

        // Convert the image to a tensor with an additional dimension
        var tensor = new DenseTensor<float>(new[] { 1, 1, 3, targetHeight, targetWidth });

        for (int y = 0; y < targetHeight; y++)
        {
            for (int x = 0; x < targetWidth; x++)
            {
                Color pixel = resizedImage.GetPixel(x, y);

                // Normalize RGB values to [0, 1] range
                tensor[0, 0, 0, y, x] = pixel.R / 255f;
                tensor[0, 0, 1, y, x] = pixel.G / 255f;
                tensor[0, 0, 2, y, x] = pixel.B / 255f;
            }
        }
        return tensor;
    }

    private Tensor<float> PreprocessImage(string imagePath)
    {
        // Load the image
        using var bitmap = new Bitmap(imagePath);

        // Ensure the image is of size 854x480
        const int targetWidth = 854;
        const int targetHeight = 480;

        // Convert to a Tensor<float> in NCHW format (batch, channels, height, width)
        var tensor = new DenseTensor<float>(new[] { 1, 3, targetHeight, targetWidth }); // Batch, Channels, Height, Width
        for (int y = 0; y < targetHeight; y++)
        {
            for (int x = 0; x < targetWidth; x++)
            {
                var pixel = bitmap.GetPixel(x, y);

                // Normalize pixel values to [0, 1] and populate the tensor
                tensor[0, 0, y, x] = pixel.R / 255f; // Red channel
                tensor[0, 1, y, x] = pixel.G / 255f; // Green channel
                tensor[0, 2, y, x] = pixel.B / 255f; // Blue channel
            }
        }

        return tensor;
    }

    private List<NamedOnnxValue> PrepareVisionInputs(string imagePath)
    {
        // Preprocess the image
        var imageTensor = PreprocessImage(imagePath);

        // Prepare the image size tensor (Height and Width)
        var imageSizeTensor = new DenseTensor<long>(new[] { 1, 2 });
        imageSizeTensor[0, 0] = 480; // Height
        imageSizeTensor[0, 1] = 854; // Width

        // Debugging: Print the tensor dimensions and types
        Debug.Print($"Image tensor dimensions: {string.Join(", ", imageTensor.Dimensions.ToArray())}");
        Debug.Print($"Image size tensor dimensions: {string.Join(", ", imageSizeTensor.Dimensions.ToArray())}");

        // Create inputs for the vision model
        var inputs = new List<NamedOnnxValue>
        {
            NamedOnnxValue.CreateFromTensor("pixel_values", imageTensor),
            NamedOnnxValue.CreateFromTensor("image_sizes", imageSizeTensor)
        };

        return inputs;
    }
    
    private List<NamedOnnxValue> PrepareTextInputs(Tensor<float> visualFeatures)
    {
        // Create a dummy attention mask (adjust based on your model's requirements)
        var attentionMask = new DenseTensor<float>(new[] { 1, visualFeatures.Dimensions[1] });

        // Create inputs for the text model
        var inputs = new List<NamedOnnxValue>
        {
           NamedOnnxValue.CreateFromTensor("inputs_embeds", visualFeatures),
           NamedOnnxValue.CreateFromTensor("attention_mask", attentionMask)
        };

        return inputs;
    }


    private string GenerateResponse(Tensor<float> visionOutput)
    {
        // Prepare inputs for the text model
        var inputs = new List<NamedOnnxValue>
        {
            NamedOnnxValue.CreateFromTensor("input", visionOutput)
        };

        // Run the text model
        using (var results = textSession.Run(inputs))
        {
            var output = results.First().AsTensor<float>();
            return ProcessTextOutput(output);
        }
    }

    private string ProcessTextOutput(Tensor<float> output)
    {
        // Convert the float array to a byte array
        var floatArray = output.ToArray();
        var byteArray = new byte[floatArray.Length * sizeof(float)];
        Buffer.BlockCopy(floatArray, 0, byteArray, 0, byteArray.Length);

        // Convert the byte array to a string
        var text = Encoding.UTF8.GetString(byteArray);
        return text;
    }


    private string DecodeTokenIds(int[] tokenIds)
    {
        // Decode the token IDs into text using the tokenizer mapping
        var text = string.Join("", tokenIds.Select(id => tokenIdToTextMap.ContainsKey(id) ? tokenIdToTextMap[id] : $"<unk:{id}>"));
        return text;
    }

    private void DetectObjects(string imagePath)
    {
        try
        {
            // Prepare inputs for the vision model (PrepareVisionInputs
            // preprocesses the image internally, so there is no need to
            // call PreprocessImage separately here)
            var visionInputs = PrepareVisionInputs(imagePath);

            // Run the vision model
            var visualFeatures = RunVisionModel(visionInputs);

            // Prepare inputs for the text model
            var textInputs = PrepareTextInputs(visualFeatures);

            // Run the text model
            var response = RunTextModel(textInputs);

            // Display the response
            MessageBox.Show($"Model Response: {response}");
        }
        catch (Exception ex)
        {
            Debug.Print(ex.Message);
            MessageBox.Show($"Error during inference: {ex.Message}");
        }
    }


    private string RunTextModel(List<NamedOnnxValue> inputs)
    {
        using (var results = textSession.Run(inputs))
        {
            var logits = results.First().AsTensor<float>();

            // Decode the logits into text (adjust based on your tokenizer)
            var tokenIds = logits.ToArray().Select(x => (int)x).ToArray();
            var text = DecodeTokenIds(tokenIds);
            return text;
        }
    }


    private Tensor<float> RunVisionModel(List<NamedOnnxValue> inputs)
    {
        using (var results = visionSession.Run(inputs))
        {
            // Copy the output before 'results' is disposed; the tensor from
            // AsTensor<float>() can be backed by native memory that Dispose frees
            var visualFeatures = results.First().AsTensor<float>().ToDenseTensor();
            return visualFeatures;
        }
    }


    private Tensor<float> ConvertImageToTensor(Image image)
    {
        // Resize the image to the model's input size (e.g., 224x224)
        const int targetWidth = 224;
        const int targetHeight = 224;

        Bitmap resizedImage = new Bitmap(image, new Size(targetWidth, targetHeight));
        var tensor = new DenseTensor<float>(new[] { 1, 3, targetHeight, targetWidth });

        for (int y = 0; y < targetHeight; y++)
        {
            for (int x = 0; x < targetWidth; x++)
            {
                Color pixel = resizedImage.GetPixel(x, y);

                // Normalize RGB values to [0, 1] range
                tensor[0, 0, y, x] = pixel.R / 255f;
                tensor[0, 1, y, x] = pixel.G / 255f;
                tensor[0, 2, y, x] = pixel.B / 255f;
            }
        }

        return tensor;
    }

    private (List<Rectangle>, List<string>) ProcessOutput(float[] output)
    {
        var boxes = new List<Rectangle>();
        var descriptions = new List<string>();

        // Example logic for processing output (adjust based on your model)
        int numObjects = output.Length / 6; // Assuming 6 values per object: x, y, width, height, classId, confidence
        for (int i = 0; i < numObjects; i++)
        {
            int offset = i * 6;
            float x = output[offset] * PBImage.Width;
            float y = output[offset + 1] * PBImage.Height;
            float width = output[offset + 2] * PBImage.Width;
            float height = output[offset + 3] * PBImage.Height;
            int classId = (int)output[offset + 4];
            float confidence = output[offset + 5];

            if (confidence > 0.5) // Confidence threshold
            {
                boxes.Add(new Rectangle((int)x, (int)y, (int)width, (int)height));
                descriptions.Add($"Object {classId} with confidence {confidence:P}");
            }
        }

        return (boxes, descriptions);
    }

    private void DrawBoundingBoxes(List<Rectangle> boxes, List<string> descriptions)
    {
        if (PBImage.Image == null)
            return;

        Image image = (Image)PBImage.Image.Clone();
        using (Graphics g = Graphics.FromImage(image))
        {
            Pen pen = new Pen(Color.Red, 2);
            Font font = new Font("Arial", 10);
            Brush brush = new SolidBrush(Color.Yellow);

            for (int i = 0; i < boxes.Count; i++)
            {
                g.DrawRectangle(pen, boxes[i]);
                g.DrawString(descriptions[i], font, brush, boxes[i].Location);
            }
        }

        PBImage.Image = image;
    }


    private void BTNInfer_Click(object sender, EventArgs e)
    {
        using (OpenFileDialog openFileDialog = new OpenFileDialog())
        {
            openFileDialog.Filter = "Image Files|*.png";

            if (openFileDialog.ShowDialog() == DialogResult.OK)
            {
                string filePath = openFileDialog.FileName;
                PBImage.Image = Image.FromFile(filePath);

                DetectObjects(filePath);
            }
        }
    }

    private void BTNFinish_Click(object sender, EventArgs e)
    {
        Console.Beep();
        Environment.Exit(0);
    }
}

}
=> But when I run the code, I get a different kind of error. I don't fully understand it, but it seems something is still not correct; please look at the screenshot to see why I still have issues. I am using an image size of 854 by 480 pixels, not 224 by 224. (screenshot attached)

Thanks,
