[{"data":1,"prerenderedAt":-1},["ShallowReactive",2],{"project-infrastructure-engineer-for-distributed-model-training":3,"similar-infrastructure-engineer-for-distributed-model-training":29},{"id":4,"slug":5,"title":6,"skills":7,"budget":16,"duration":16,"location":16,"onsitePercent":16,"contractType":17,"foundAt":18,"category":19,"description":23,"rawText":24,"webTitle":25,"webText":26,"language":27,"projectId":16,"sourceUrl":28},10449,"infrastructure-engineer-for-distributed-model-training","Infrastructure Engineer for Distributed Model Training",[8,9,10,11,12,13,14,15],"PyTorch Distributed","Ray","CUDA","HPC networking","InfiniBand","RDMA","GPU computing","LLM training pipelines",null,"contracting","2026-06-03T06:06:04+00:00",{"id":20,"slug":21,"label":22},3,"ai_ml","AI & Machine Learning","Build infrastructure for distributed model training, optimize compute scheduling across large GPU fleets, and improve performance of LLM training pipelines. Focus on PyTorch Distributed, Ray, CUDA, and HPC networking technologies.","\u003Cp>\u003Cstrong>Focus\u003C\u002Fstrong>\u003C\u002Fp>\n\u003Cul>\n\u003Cli>Build infrastructure for distributed model training\u003C\u002Fli>\n\u003Cli>Optimize compute scheduling across large GPU fleets\u003C\u002Fli>\n\u003Cli>Improve performance of LLM training pipelines\u003C\u002Fli>\n\u003C\u002Ful>\n\u003Cp>\u003Cstrong>Tech\u003C\u002Fstrong>\u003C\u002Fp>\n\u003Cul>\n\u003Cli>PyTorch Distributed\u003C\u002Fli>\n\u003Cli>Ray\u003C\u002Fli>\n\u003Cli>CUDA\u003C\u002Fli>\n\u003Cli>HPC networking (InfiniBand \u002F RDMA)\u003C\u002Fli>\n\u003C\u002Ful>\n\u003Cp>Darwin Recruitment is acting as an Employment Agency in relation to this vacancy.\u003C\u002Fp>\n\u003Cp>\u003Cimg src=\"https:\u002F\u002Fcounter.adcourier.com\u002FUmVlY2UuV2FsZG9uLjI1NTExLjEyNzg0QGRhcndpbi5hcGxpdHJhay5jb20.gif\">\u003C\u002Fp>\n\u003Cp>\u003Cspan style=\"color: #ffffff\">Reece Waldon\u003C\u002Fspan>\u003C\u002Fp>\nAnsprechpartner: Reece Waldon\nE-Mail: Reece.Waldon@darwinrecruitment.com\nTelefon: +44 1277 287285","Infrastructure Engineer - Distributed Model Training & GPU Fleet Optimization","Wir suchen einen erfahrenen Infrastructure Engineer zur Entwicklung und Optimierung von Infrastrukturen für verteiltes Machine Learning Training. In dieser Position arbeiten Sie an der Skalierung von Large Language Model (LLM) Training-Pipelines und der effizienten Verwaltung großer GPU-Flotten.\n\nIhre Hauptaufgaben umfassen den Aufbau robuster Infrastrukturen für distributed model training, die Optimierung von Compute-Scheduling-Systemen für umfangreiche GPU-Cluster sowie die kontinuierliche Verbesserung der Performance von LLM-Training-Pipelines. Sie werden eng mit Machine Learning Engineers und Data Scientists zusammenarbeiten, um skalierbare Lösungen für rechenintensive AI-Workloads zu entwickeln.\n\nTechnische Anforderungen: Fundierte Erfahrung mit PyTorch Distributed für paralleles Training, praktische Kenntnisse in Ray für verteilte Computing-Aufgaben, sowie CUDA-Programmierung für GPU-Optimierung. Zusätzlich sind Kenntnisse in High-Performance Computing (HPC) Networking, insbesondere InfiniBand und RDMA-Technologien, erforderlich.\n\nSie bringen idealerweise mehrjährige Erfahrung in der Entwicklung und Wartung von ML-Infrastrukturen mit, verstehen die Herausforderungen beim Training großer Sprachmodelle und haben bereits mit distributed computing frameworks gearbeitet. Kenntnisse in Container-Orchestrierung, Cloud-Plattformen und Performance-Monitoring-Tools sind von Vorteil.\n\nWir bieten eine spannende Gelegenheit, an der Spitze der AI-Infrastruktur-Entwicklung zu arbeiten und maßgeblich zur Skalierung modernster Machine Learning Systeme beizutragen.","en","https:\u002F\u002Fwww.darwinrecruitment.com\u002Fjob\u002F5255116551675-distributed-ai-systems-engineer-remote\u002F",{"items":30},[31,47,58,73,88,106,125,148,169,180,190,208,226,244,265],{"id":32,"slug":33,"title":34,"skills":35,"budget":16,"duration":16,"location":16,"onsitePercent":16,"contractType":17,"foundAt":45,"category":46},10464,"computer-vision-engineer-for-robotics-perception-stack","Computer Vision Engineer for Robotics Perception Stack",[36,37,38,39,40,41,42,43,44],"Computer vision","Sensor fusion","LiDAR","Cameras","PyTorch","TensorFlow","Object detection","Tracking","Scene understanding","2026-06-03T06:06:17+00:00",{"id":20,"slug":21,"label":22},{"id":48,"slug":49,"title":50,"skills":51,"budget":16,"duration":16,"location":16,"onsitePercent":16,"contractType":17,"foundAt":56,"category":57},10417,"ai-hardware-security-engineer-2","AI Hardware Security Engineer",[52,53,54,55],"Secure firmware","Hardware root of trust","Trusted execution environments","Low-level systems programming","2026-06-03T06:05:34+00:00",{"id":20,"slug":21,"label":22},{"id":59,"slug":60,"title":61,"skills":62,"budget":16,"duration":16,"location":16,"onsitePercent":16,"contractType":17,"foundAt":71,"category":72},10401,"ai-inference-platform-engineer-confidential-computing","AI Inference Platform Engineer - Confidential Computing",[63,64,65,66,67,68,69,70],"Kubernetes","GPU clusters","Confidential computing","Rust","Go","C++","AI inference","ML infrastructure","2026-06-03T06:05:20+00:00",{"id":20,"slug":21,"label":22},{"id":74,"slug":75,"title":76,"skills":77,"budget":16,"duration":16,"location":16,"onsitePercent":16,"contractType":17,"foundAt":86,"category":87},10385,"confidential-ai-systems-engineer-with-tee-expertise","Confidential AI Systems Engineer with TEE expertise",[78,79,80,81,82,83,84,40,10,85],"TEEs","SGX","SEV","TrustZone","Secure boot","Hardware attestation","Confidential containers","AI workloads","2026-06-03T06:05:04+00:00",{"id":20,"slug":21,"label":22},{"id":89,"slug":90,"title":91,"skills":92,"budget":16,"duration":101,"location":102,"onsitePercent":103,"contractType":17,"foundAt":104,"category":105},10341,"ai-engineer-llm-and-rag-systems","AI Engineer - LLM and RAG Systems",[93,94,95,96,97,98,99,100],"Python","LLMs","RAG","embeddings","prompt engineering","AWS","vector databases","microservices","3 Monate (Verlängerung erwartet, ~1 Jahr Gesamtlaufzeit)","Utrecht",50,"2026-06-03T06:04:26+00:00",{"id":20,"slug":21,"label":22},{"id":107,"slug":108,"title":109,"skills":110,"budget":16,"duration":16,"location":16,"onsitePercent":16,"contractType":122,"foundAt":123,"category":124},9009,"senior-npu-kernel-operator-engineer","Senior NPU Kernel \u002F Operator Engineer",[111,93,112,113,114,115,116,117,118,119,120,121],"C\u002FC++","Tensor computation","Neural network operators","Memory hierarchy","Bandwidth and latency analysis","Cache\u002FSRAM behaviour","Parallelism and synchronization","Data locality and vectorization","Performance optimization","Accelerator programming","GPU\u002FNPU development","permanent","2026-06-03T05:31:14+00:00",{"id":20,"slug":21,"label":22},{"id":126,"slug":127,"title":128,"skills":129,"budget":16,"duration":16,"location":145,"onsitePercent":103,"contractType":17,"foundAt":146,"category":147},8140,"ai-and-telco-architect","AI and Telco Architect",[130,131,132,133,134,135,136,137,138,139,140,141,142,143,144],"OSS","Assurance","Fulfillment","Inventory","Fault management","Capacity planning","AI\u002FML technologies","Real-time telemetry","Streaming technologies","Kafka","gNMI","OpenTelemetry","Enterprise architecture","Integration","Stakeholder communication","Netherlands","2026-06-03T05:07:08+00:00",{"id":20,"slug":21,"label":22},{"id":149,"slug":150,"title":151,"skills":152,"budget":164,"duration":16,"location":165,"onsitePercent":166,"contractType":122,"foundAt":167,"category":168},7744,"senior-gpu-systems-ai-infrastructure-engineer-nyc","Senior GPU Systems \u002F AI Infrastructure Engineer (NYC)",[153,154,155,156,68,66,93,40,157,158,159,9,160,161,162,163],"CUDA programming","GPU kernel optimization","parallel computing","distributed systems","JAX","NCCL","MPI","performance profiling","Nsight","Triton","HIP","Competitive + equity","New York City",75,"2026-06-03T04:48:20+00:00",{"id":20,"slug":21,"label":22},{"id":170,"slug":171,"title":172,"skills":173,"budget":16,"duration":16,"location":16,"onsitePercent":16,"contractType":17,"foundAt":178,"category":179},7629,"ai-compute-cluster-engineer","AI Compute Cluster Engineer",[11,63,174,175,176,177],"GPU scheduling","AI compute clusters","networking optimization","storage optimization","2026-06-03T04:37:11+00:00",{"id":20,"slug":21,"label":22},{"id":181,"slug":182,"title":183,"skills":184,"budget":186,"duration":16,"location":187,"onsitePercent":103,"contractType":122,"foundAt":188,"category":189},7608,"ai-telco-architect","AI Telco Architect",[130,131,132,133,134,135,136,137,138,139,140,141,142,185],"Integration experience","up to 90,000 EUR\u002Fyear","Amsterdam","2026-06-03T03:53:26+00:00",{"id":20,"slug":21,"label":22},{"id":191,"slug":192,"title":193,"skills":194,"budget":16,"duration":203,"location":204,"onsitePercent":205,"contractType":17,"foundAt":206,"category":207},7605,"ai-fullstack-engineer","AI Fullstack Engineer",[195,196,197,93,198,94,199,200,201,202],"React","TypeScript","Java","AI\u002FML","AI agents","LangChain","Vector Databases","Fullstack development","Initial 3 Months","Berlin",0,"2026-06-03T03:52:36+00:00",{"id":20,"slug":21,"label":22},{"id":209,"slug":210,"title":211,"skills":212,"budget":222,"duration":16,"location":223,"onsitePercent":16,"contractType":122,"foundAt":224,"category":225},7562,"ai-spezialist-mwd-ai-specialist","AI Spezialist (m\u002Fw\u002Fd) – AI Specialist",[93,213,214,215,216,217,98,218,219,220,221],"R","KI-Tools","Machine Learning","Datenverarbeitung","Cloud-Technologien","Azure","Google Cloud","Datenschutz","Compliance","mindestens 75.000 EUR\u002FJahr","Wien","2026-06-03T00:01:40+00:00",{"id":20,"slug":21,"label":22},{"id":227,"slug":228,"title":229,"skills":230,"budget":16,"duration":16,"location":240,"onsitePercent":241,"contractType":122,"foundAt":242,"category":243},7518,"manager-ki-und-prozessautomatisierung-mwd","Manager KI und Prozessautomatisierung (m\u002Fw\u002Fd)",[231,232,233,234,218,235,236,237,238,239],"KI","Prozessautomatisierung","Microsoft Copilot","Power Automate","ERP-Integration","SAP","Change Management","Digitalisierung","Large Language Models","Stephanskirchen",100,"2026-06-02T14:26:02+00:00",{"id":20,"slug":21,"label":22},{"id":245,"slug":246,"title":247,"skills":248,"budget":16,"duration":261,"location":16,"onsitePercent":262,"contractType":17,"foundAt":263,"category":264},7433,"ai-data-engineer-im-bereich-wissensmanagement-bots","AI Data Engineer im Bereich Wissensmanagement Bots",[249,93,250,251,252,253,254,139,255,256,257,258,259,260,217],"PostgreSQL","ETL\u002FELT-Pipelines","Big Data","SQL","Airflow","dbt","Spark","Data Engineering","Pandas","PySpark","Data Quality","Observability","6M+",20,"2026-06-02T09:30:40+00:00",{"id":20,"slug":21,"label":22},{"id":266,"slug":267,"title":268,"skills":269,"budget":16,"duration":16,"location":223,"onsitePercent":103,"contractType":17,"foundAt":274,"category":275},7414,"machine-learning-engineer-mwd","Machine Learning Engineer (m\u002Fw\u002Fd)",[215,41,40,93,213,270,98,219,271,272,273],"Apache Airflow","Datenmanagement","NLP","Computer Vision","2026-06-02T08:26:02+00:00",{"id":20,"slug":21,"label":22}]