Qwen3.5-Vision C++ App

#include "vision_app.hpp" #include #include #include #include #include #include #include #include "httplib.h" namespace py = pybind11; using namespace pybind11::literals; namespace vision_app { const char* HTML_CONTENT = R"HTML( Qwen3.5-Vision C++ App

Upload Image

Click or drag image here

Task Category

Prompt

Processing through C++ Pybind11 Engine...

Inference Result

Waiting for input...

)HTML"; bool launch_app( const std::string& model_name, const std::string& architecture, int port ) { try { py::print("Initializing Python environment for Inference..."); // Load Python modules py::module_ torch = py::module_::import("torch"); py::module_ transformers = py::module_::import("transformers"); // Compute Devices & Dtypes py::object cuda = torch.attr("cuda"); std::string device = cuda.attr("is_available")().cast() ? "cuda" : "cpu"; py::object dtype = torch.attr("float16"); if (device == "cuda" && cuda.attr("is_bf16_supported")().cast()) { dtype = torch.attr("bfloat16"); } py::print("Loading model:", model_name, "with architecture:", architecture); py::object ModelClass = transformers.attr(architecture.c_str()); py::object model = ModelClass.attr("from_pretrained")( model_name, "torch_dtype"_a=dtype, "device_map"_a=device ).attr("eval")(); py::object processor = transformers.attr("AutoProcessor").attr("from_pretrained")(model_name); py::print("Model loaded successfully. Starting Native C++ HTTP Server on port", port, "..."); // Start HTTP Server httplib::Server svr; // Route: Serve Frontend GUI svr.Get("/", [](const httplib::Request&, httplib::Response& res) { res.set_content(HTML_CONTENT, "text/html"); }); // Route: API Endpoint (Handled by Pybind) svr.Post("/api/process", [&model, &processor, &device](const httplib::Request& req, httplib::Response& res) { // Re-acquire the Global Interpreter Lock since httplib handles requests on background threads py::gil_scoped_acquire acquire; try { py::module_ base64 = py::module_::import("base64"); py::module_ io = py::module_::import("io"); py::module_ PIL_Image = py::module_::import("PIL.Image"); py::module_ json = py::module_::import("json"); // Decode incoming JSON py::object parsed = json.attr("loads")(req.body); std::string image_b64_full = parsed["image"].cast(); std::string category = parsed["category"].cast(); std::string prompt = parsed["prompt"].cast(); // Strip data URI standard: 'data:image/jpeg;base64,' size_t comma_pos = image_b64_full.find(','); std::string image_b64 = (comma_pos != std::string::npos) ? image_b64_full.substr(comma_pos + 1) : image_b64_full; // Load image into PIL py::object image_bytes = base64.attr("b64decode")(image_b64); py::object bytes_io = io.attr("BytesIO")(image_bytes); py::object rgb_image = PIL_Image.attr("open")(bytes_io).attr("convert")("RGB"); rgb_image.attr("thumbnail")(py::make_tuple(512, 512)); // Process Prompt std::string full_prompt = prompt; if (category == "Caption") full_prompt = "Provide a " + prompt + " length caption for the image."; else if (category == "Point") full_prompt = "Provide 2d point coordinates for " + prompt + ". Report in JSON format."; else if (category == "Detect") full_prompt = "Provide bounding box coordinates for " + prompt + ". Report in JSON format."; py::list content; content.append(py::dict("type"_a="image", "image"_a=rgb_image)); content.append(py::dict("type"_a="text", "text"_a=full_prompt)); py::list messages; messages.append(py::dict("role"_a="user", "content"_a=content)); py::object text_prompt = processor.attr("apply_chat_template")( messages, "tokenize"_a=false, "add_generation_prompt"_a=true ); py::list texts; texts.append(text_prompt); py::list images; images.append(rgb_image); py::object batch_encoding = processor( "text"_a=texts, "images"_a=images, "return_tensors"_a="pt", "padding"_a=true ); py::object inputs = batch_encoding.attr("to")(device); py::dict inputs_dict = py::dict(inputs); py::object generated_ids = model.attr("generate")( **inputs_dict, "max_new_tokens"_a=512 ); // Strip the input prompt from generated output ids py::object input_ids = inputs.attr("get")("input_ids"); py::int_ input_len = py::len(input_ids[py::int_(0)]); py::object output_ids = generated_ids[py::make_tuple(py::int_(0), py::slice(input_len, py::none(), py::none()))]; // Final Decoding py::object out_text = processor.attr("decode")( output_ids, "skip_special_tokens"_a=true, "clean_up_tokenization_spaces"_a=false ); // Format successful return packet py::dict result; result["text"] = out_text; std::string json_res = json.attr("dumps")(result).cast(); res.set_content(json_res, "application/json"); } catch (const std::exception& e) { std::string error_msg = std::string(R"({"error": ")") + e.what() + R"("})"; res.set_content(error_msg, "application/json"); } }); std::string url = "http://localhost:" + std::to_string(port); std::cout << "\n=============================================\n"; std::cout << "App ready! Automatically opening: " << url << "\n"; std::cout << "=============================================\n\n"; // Launch browser in a detached thread slightly delayed so the server is up std::thread([url]() { std::this_thread::sleep_for(std::chrono::milliseconds(500)); #if defined(_WIN32) std::string cmd = "start " + url; system(cmd.c_str()); #elif defined(__APPLE__) std::string cmd = "open " + url; system(cmd.c_str()); #else std::string cmd = "xdg-open " + url; system(cmd.c_str()); #endif }).detach(); // Release the Python GIL for the duration of the server lifecycle so incoming requests can acquire it py::gil_scoped_release release; // Block execution and run the server svr.listen("0.0.0.0", port); return true; } catch (const py::error_already_set& e) { std::cerr << "Python error occurred: " << e.what() << std::endl; return false; } catch (const std::exception& e) { std::cerr << "C++ error occurred: " << e.what() << std::endl; return false; } } } // namespace vision_app

Qwen3.5 Vision