AI 자율주행과 엔드투엔드 (End-to-End) 딥러닝 완전 가이드: Tesla FSD, Waymo, 엔드투엔드 솔루션 실전

서론
자율주행은 "모듈화 (Modular)"에서 "엔드투엔드 (End-to-End)"로 나아가고 있습니다. Tesla FSD V12는 순수 비전 엔드투엔드 솔루션의 가능성을 증명했으며, Waymo는 Robotaxi 분야에서 지속적으로 심화 연구를 진행하고 있습니다. 2026년, 엔드투엔드 자율주행은 이미 업계의 공통된 합의가 되었습니다.

자율주행 기술 개요
자율주행 기술 아키텍처 비교:

모듈화 솔루션 (전통적 방식)

인지 (Perception) → 계획 (Planning) → 제어 (Control)
장점: 설명 가능성 (Explainability)이 높음, 디버깅이 용이함
한계: 오차 누적, 복잡한 규칙
대표 사례: 대부분의 전통적인 자동차 기업

엔드투엔드 솔루션 (Tesla FSD V12)

센서 입력 (Sensor Input) → 주행 출력 (Driving Output)
장점: 오차 누적 없음, 인간과 유사한 주행
한계: 설명 가능성이 낮음, 방대한 데이터 필요
대표 사례: Tesla, Wayve

엔드투엔드 + 규칙 기반 보완 (Rule-based fallback)

장점: 안전성과 성능의 균형
대표 사례: Huawei, XPeng

멀티모달 거대 모델 솔루션 (Multimodal Large Model)

VLM + 주행 전략
장점: 일반화 (Generalization) 능력이 강함
대표 사례: DriveGPT, UniAD

Tesla FSD 아키텍처
FSD 핵심 구현

import torch
import torch.nn as nn
from typing import Dict, List, Tuple, Optional
import numpy as np

class OccupancyNetwork:
    """Voxelize a point cloud into a density grid and encode it with 3D convs.

    The encoder is not created by the constructor; call ``build_encoder``
    first, or let ``forward`` build a single-channel encoder on first use.
    """

    def __init__(self, voxel_size: float = 0.1):
        """Store the voxel edge length (same unit as the point coordinates)."""
        if voxel_size <= 0:
            raise ValueError("voxel_size must be positive")
        self.voxel_size = voxel_size
        self.encoder = None
        self.decoder = None

    def build_encoder(self, input_channels: int = 12):
        """Build the 3D convolutional encoder and store it in ``self.encoder``.

        NOTE: ``forward`` feeds a single-channel density grid, so an encoder
        used with ``forward`` must be built with ``input_channels=1``.
        """
        self.encoder = nn.Sequential(
            nn.Conv3d(input_channels, 64, kernel_size=3, padding=1),
            nn.BatchNorm3d(64),
            nn.ReLU(inplace=True),
            nn.MaxPool3d(2),
            nn.Conv3d(64, 128, kernel_size=3, padding=1),
            nn.BatchNorm3d(128),
            nn.ReLU(inplace=True),
            nn.MaxPool3d(2),
            nn.Conv3d(128, 256, kernel_size=3, padding=1),
            nn.BatchNorm3d(256),
            nn.ReLU(inplace=True),
        )

    def _grid_shape(
        self, grid_range: Tuple[float, float, float, float, float, float]
    ) -> Tuple[int, int, int]:
        """Return the number of voxels along x, y and z for ``grid_range``."""
        x_min, x_max, y_min, y_max, z_min, z_max = grid_range
        # Round before truncating so that e.g. 100 / 0.1 cannot end up as 999
        # because of floating-point representation error.
        return tuple(
            int(round((hi - lo) / self.voxel_size, 6))
            for lo, hi in ((x_min, x_max), (y_min, y_max), (z_min, z_max))
        )

    def voxelize(
        self,
        points: np.ndarray,
        grid_range: Tuple[float, float, float, float, float, float] = (-50, 50, -50, 50, -5, 5),
    ) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
        """Map points to integer voxel indices and drop points outside the grid.

        Args:
            points: ``(N, C)`` array with ``C >= 3``; the first three columns
                are read as x, y, z.
            grid_range: ``(x_min, x_max, y_min, y_max, z_min, z_max)``.

        Returns:
            ``(valid_points, voxel_x, voxel_y, voxel_z)`` restricted to the
            points that fall inside the grid.

        Raises:
            ValueError: if ``points`` is not 2-D with at least 3 columns.
        """
        points = np.asarray(points)
        if points.ndim != 2 or points.shape[1] < 3:
            raise ValueError("points must have shape (N, 3) or (N, C>=3)")

        x_min, _, y_min, _, z_min, _ = grid_range
        origin = np.array([x_min, y_min, z_min], dtype=np.float64)
        upper = np.array(self._grid_shape(grid_range), dtype=np.int64)

        # floor (not truncation) so that points slightly below the lower bound
        # get index -1 and are rejected instead of being folded into voxel 0.
        indices = np.floor((points[:, :3] - origin) / self.voxel_size).astype(np.int64)
        mask = np.all((indices >= 0) & (indices < upper), axis=1)

        kept = indices[mask]
        return points[mask], kept[:, 0], kept[:, 1], kept[:, 2]

    def forward(
        self,
        points: np.ndarray,
        grid_range: Tuple[float, float, float, float, float, float] = (-50, 50, -50, 50, -5, 5),
    ) -> Dict:
        """Encode a point cloud into occupancy features.

        Args:
            points: ``(N, C>=3)`` point cloud.
            grid_range: grid extents, forwarded to ``voxelize``.

        Returns:
            ``{"occupancy": encoder output, "points": points inside the grid}``.

        NOTE: the dense grid holds one float32 per voxel; with the default
        range and ``voxel_size=0.1`` that is 1000 x 1000 x 100 voxels
        (about 400 MB), so use a coarser voxel size or smaller range on CPU.
        """
        valid_points, vx, vy, vz = self.voxelize(points, grid_range)

        # Point-count density per voxel; np.add.at accumulates repeated indices.
        density = np.zeros(self._grid_shape(grid_range), dtype=np.float32)
        np.add.at(density, (vx, vy, vz), 1.0)

        if self.encoder is None:
            # The density grid has exactly one channel.
            self.build_encoder(input_channels=1)

        voxel_tensor = torch.from_numpy(density).unsqueeze(0).unsqueeze(0)  # (1, 1, X, Y, Z)
        features = self.encoder(voxel_tensor)
        return {"occupancy": features, "points": valid_points}


class BirdEyeView:
    """Bird's-eye-view (BEV) feature fusion helpers."""

    def __init__(self, feature_dim: int = 256):
        """Store the channel count expected of every fused feature map."""
        self.feature_dim = feature_dim

    def build_bev(
        self,
        multi_scale_features: List[torch.Tensor],
        target_size: Tuple[int, int] = (200, 200),
    ) -> torch.Tensor:
        """Resize every feature map to ``target_size`` and sum them.

        Args:
            multi_scale_features: list of ``(B, feature_dim, h, w)`` tensors.
            target_size: output ``(H, W)``.

        Returns:
            ``(B, feature_dim, H, W)`` tensor; an all-zero tensor with batch
            size 1 when the list is empty.

        Raises:
            ValueError: if a feature map does not have ``feature_dim`` channels.
        """
        if not multi_scale_features:
            return torch.zeros(1, self.feature_dim, target_size[0], target_size[1])

        resized = []
        for feat in multi_scale_features:
            if feat.shape[1] != self.feature_dim:
                raise ValueError(
                    f"expected {self.feature_dim} channels, got {feat.shape[1]}"
                )
            resized.append(
                nn.functional.interpolate(
                    feat, size=target_size, mode="bilinear", align_corners=False
                )
            )
        # Summing the stack keeps the batch size, device and dtype of the inputs.
        return torch.stack(resized, dim=0).sum(dim=0)

    @staticmethod
    def bev_to_image_coords(
        bev_coords: np.ndarray,
        origin: Tuple[float, float] = (0, 0),
        resolution: float = 0.1,
    ) -> np.ndarray:
        """Convert BEV coordinates to integer image (pixel) coordinates.

        Args:
            bev_coords: ``(N, 2+)`` array; columns 0 and 1 are read as x and y.
            origin: BEV coordinate that maps to pixel ``(0, 0)``.
            resolution: BEV units per pixel.

        Returns:
            ``(N, 2)`` integer array of ``(img_x, img_y)``.
        """
        x, y = bev_coords[:, 0], bev_coords[:, 1]
        # floor keeps coordinates below the origin in negative pixels instead
        # of collapsing them onto pixel 0.
        img_x = np.floor((x - origin[0]) / resolution).astype(int)
        img_y = np.floor((y - origin[1]) / resolution).astype(int)
        return np.stack([img_x, img_y], axis=-1)


class PlanningNetwork(nn.Module):
    """MLP planner mapping fused scene features to (steering, throttle)."""

    def __init__(self, input_dim: int = 512, hidden_dim: int = 256):
        super().__init__()
        self.planner = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(inplace=True),
            nn.Dropout(0.1),
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(inplace=True),
            nn.Dropout(0.1),
            nn.Linear(hidden_dim, 2),  # (steering, throttle)
        )

    def forward(
        self,
        bev_features: torch.Tensor,
        route_features: torch.Tensor,
        traffic_features: torch.Tensor,
    ) -> Dict[str, torch.Tensor]:
        """Predict the raw planner output and its tanh-squashed control signal.

        Args:
            bev_features: BEV features; flattened per sample before fusion.
            route_features: ``(B, R)`` route features.
            traffic_features: ``(B, T)`` traffic-state features.

        Returns:
            ``{"trajectory": (B, 2) raw output, "control": tanh(trajectory)}``.

        Raises:
            ValueError: if the concatenated feature width differs from the
                planner's input dimension.
        """
        bev_flat = bev_features.flatten(1)
        # Concatenate along the feature axis so the batch axis is preserved.
        combined = torch.cat([bev_flat, route_features, traffic_features], dim=1)

        expected = self.planner[0].in_features
        if combined.shape[1] != expected:
            raise ValueError(
                f"combined feature width {combined.shape[1]} != input_dim {expected}"
            )

        trajectory = self.planner(combined)
        control = torch.tanh(trajectory)
        return {"trajectory": trajectory, "control": control}
class FSDStackedHourglass(nn.Module):
    """Hourglass-style encoder/decoder producing ``num_joints`` output maps."""

    def __init__(self, num_joints: int = 2):
        super().__init__()
        self.num_joints = num_joints
        # Stem plus three conv stages. The stages must be part of the encoder:
        # the hourglass below expects 256 input channels, not the stem's 64.
        self.encoder = nn.Sequential(
            nn.Conv2d(12, 64, kernel_size=7, stride=2, padding=3),
            nn.BatchNorm2d(64),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(2),
            self._make_layer(64, 128),
            self._make_layer(128, 128),
            self._make_layer(128, 256),
        )
        self.hourglass = self._make_hourglass(256, num_joints)

    def _make_layer(self, in_ch: int, out_ch: int) -> nn.Module:
        """Return two 3x3 conv + BatchNorm + ReLU units (spatial size kept)."""
        return nn.Sequential(
            nn.Conv2d(in_ch, out_ch, kernel_size=3, padding=1),
            nn.BatchNorm2d(out_ch),
            nn.ReLU(inplace=True),
            nn.Conv2d(out_ch, out_ch, kernel_size=3, padding=1),
            nn.BatchNorm2d(out_ch),
            nn.ReLU(inplace=True),
        )

    def _make_hourglass(self, channels: int, num_joints: int) -> nn.Module:
        """Build the hourglass: two 2x downsamplings, then two 2x upsamplings.

        The final 1x1 convolution maps to ``num_joints`` channels. There are
        no skip connections; the module is a plain sequential stack.
        """
        wide = channels * 2
        return nn.Sequential(
            self._make_layer(channels, channels),
            self._make_layer(channels, channels),
            nn.MaxPool2d(2),
            self._make_layer(channels, wide),
            self._make_layer(wide, wide),
            nn.MaxPool2d(2),
            self._make_layer(wide, wide),
            nn.Upsample(scale_factor=2, mode='bilinear', align_corners=False),
            self._make_layer(wide, wide),
            nn.Upsample(scale_factor=2, mode='bilinear', align_corners=False),
            nn.Conv2d(wide, num_joints, kernel_size=1),
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Run the encoder and the hourglass.

        Args:
            x: ``(B, 12, H, W)`` input (the stem convolution takes 12 channels).

        Returns:
            ``(B, num_joints, H/4, W/4)`` tensor when H and W are divisible
            by 16 (stem: /4, hourglass: /4 then x4).
        """
        encoded = self.encoder(x)
        outputs = self.hourglass(encoded)
        return outputs

엔드투엔드 자율주행 UniAD 아키텍처 (UniAD Architecture)
class UniAD(nn.Module):
    """End-to-end driving model: camera images -> BEV -> task heads.

    The heads (``BEVEncoder``, ``MapHead``, ``DetectionHead``, ``MotionHead``,
    ``PlanningHead``) are looked up by name at construction time and must be
    defined in the same module.
    """

    def __init__(self, config: Dict):
        super().__init__()
        self.config = config
        self.backbone = self._build_backbone()
        self.neck = self._build_neck()
        self.bev_encoder = BEVEncoder(in_channels=512, out_channels=config.get("bev_channels", 256))
        self.map_head = MapHead(config)
        self.detection_head = DetectionHead(config)
        self.motion_head = MotionHead(config)
        self.planning_head = PlanningHead(config)

    def _build_backbone(self) -> nn.Module:
        """Build the image backbone: a 7x7 stem followed by four conv stages."""
        return nn.Sequential(
            nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3),
            nn.BatchNorm2d(64),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(3, stride=2, padding=1),
            # Conv stages with ResNet-34-like depths (3, 4, 6, 3).
            self._make_resblock(64, 64, num_blocks=3),
            self._make_resblock(64, 128, num_blocks=4, stride=2),
            self._make_resblock(128, 256, num_blocks=6, stride=2),
            self._make_resblock(256, 512, num_blocks=3, stride=2),
        )

    def _make_resblock(self, in_ch: int, out_ch: int, num_blocks: int, stride: int = 1) -> nn.Module:
        """Return ``num_blocks`` conv + BatchNorm + ReLU units.

        Only the first unit changes the channel count and applies ``stride``.
        NOTE: despite the name there are no residual (skip) connections here.
        """
        layers = [
            nn.Conv2d(in_ch, out_ch, kernel_size=3, stride=stride, padding=1),
            nn.BatchNorm2d(out_ch),
            nn.ReLU(inplace=True),
        ]
        for _ in range(num_blocks - 1):
            layers.append(nn.Conv2d(out_ch, out_ch, kernel_size=3, padding=1))
            layers.append(nn.BatchNorm2d(out_ch))
            layers.append(nn.ReLU(inplace=True))
        return nn.Sequential(*layers)

    def _build_neck(self) -> nn.Module:
        """Build the neck: a 1x1 conv reducing 512 -> 256, then a 3x3 conv."""
        return nn.Sequential(
            nn.Conv2d(512, 256, kernel_size=1),
            nn.BatchNorm2d(256),
            nn.ReLU(inplace=True),
            nn.Conv2d(256, 256, kernel_size=3, padding=1),
            nn.BatchNorm2d(256),
            nn.ReLU(inplace=True),
        )

    def forward(
        self,
        images: torch.Tensor,
        intrinsics: torch.Tensor,
        extrinsics: torch.Tensor,
    ) -> Dict[str, torch.Tensor]:
        """Run the full pipeline on a batch of multi-view images.

        Args:
            images: ``(B, N, 3, H, W)`` multi-view images.
            intrinsics: ``(B, N, 3, 3)`` camera intrinsics.
            extrinsics: ``(B, N, 4, 4)`` camera extrinsics.

        Returns:
            Dict with keys ``"detection"``, ``"tracking"``, ``"map"``,
            ``"motion"`` and ``"planning"`` holding each head's output.

        Raises:
            ValueError: if ``images`` is not 5-dimensional.
        """
        if images.dim() != 5:
            raise ValueError("images must have shape (B, N, 3, H, W)")
        B, N, C, H, W = images.shape

        multi_view_features = []
        for i in range(N):
            img = images[:, i]  # (B, 3, H, W)
            feat = self.backbone(img)  # (B, 512, H', W')
            feat = self.neck(feat)
            multi_view_features.append(feat)

        # View fusion + BEV encoding. Both camera parameter sets are passed
        # because BEVEncoder.forward takes intrinsics and extrinsics.
        bev_features = self.bev_encoder(multi_view_features, intrinsics, extrinsics)

        detection = self.detection_head(bev_features)
        tracking = self.detection_head(bev_features)  # simplification: reuses the detection head
        map_pred = self.map_head(bev_features)
        # MotionHead.forward takes the map lanes as its third argument.
        motion = self.motion_head(bev_features, detection["boxes"], map_pred["lanes"])
        planning = self.planning_head(bev_features, motion["trajectories"])

        return {
            "detection": detection,
            "tracking": tracking,
            "map": map_pred,
            "motion": motion,
            "planning": planning,
        }
class BEVEncoder(nn.Module):
    """BEV encoder: averages per-view feature maps and refines them with convs."""

    def __init__(self, in_channels: int, out_channels: int):
        super().__init__()
        self.encoder = nn.Sequential(
            nn.Conv2d(in_channels, 128, kernel_size=3, padding=1),
            nn.BatchNorm2d(128),
            nn.ReLU(inplace=True),
            nn.Conv2d(128, out_channels, kernel_size=3, padding=1),
            nn.BatchNorm2d(out_channels),
            nn.ReLU(inplace=True),
        )

    def forward(
        self,
        multi_view_features: List[torch.Tensor],
        intrinsics: Optional[torch.Tensor] = None,
        extrinsics: Optional[torch.Tensor] = None,
    ) -> torch.Tensor:
        """Fuse per-view features into a single BEV feature map.

        Args:
            multi_view_features: list of ``(B, C, H, W)`` per-view features
                with ``C == in_channels`` of the first convolution.
            intrinsics: camera intrinsics; accepted but not used by this
                simplified implementation.
            extrinsics: camera extrinsics; accepted but not used by this
                simplified implementation.

        Returns:
            ``(B, out_channels, H, W)`` BEV features.

        Raises:
            ValueError: if ``multi_view_features`` is empty.
        """
        if not multi_view_features:
            raise ValueError("multi_view_features must contain at least one tensor")

        # A real system would use a Transformer for the view transformation;
        # here the views are simply averaged.
        fused = torch.stack(multi_view_features, dim=1).mean(dim=1)  # (B, C, H, W)
        bev = self.encoder(fused)
        return bev

class MapHead(nn.Module):
    """Map perception head combining lane detection and map segmentation.

    ``LaneDetectionHead`` and ``MapSegmentationHead`` are looked up by name at
    construction time and must be defined in the same module.
    """

    def __init__(self, config: Dict):
        super().__init__()
        self.lane_head = LaneDetectionHead(config)
        self.segmentation_head = MapSegmentationHead(config)

    def forward(self, bev_features: torch.Tensor) -> Dict[str, torch.Tensor]:
        """Run both sub-heads on the same BEV features.

        Returns:
            ``{"lanes": lane head output, "segmentation": segmentation output}``.
        """
        lanes = self.lane_head(bev_features)
        segmentation = self.segmentation_head(bev_features)
        return {"lanes": lanes, "segmentation": segmentation}

class LaneDetectionHead(nn.Module):
    """Lane detection head predicting per-cell confidence and a 2-D offset."""

    def __init__(self, config: Dict):
        # ``config`` is accepted for a uniform head interface; no key is read.
        super().__init__()
        self.head = nn.Sequential(
            nn.Conv2d(256, 128, kernel_size=3, padding=1),
            nn.BatchNorm2d(128),
            nn.ReLU(inplace=True),
            nn.Conv2d(128, 64, kernel_size=3, padding=1),
            nn.BatchNorm2d(64),
            nn.ReLU(inplace=True),
        )
        self.confidence = nn.Conv2d(64, 1, kernel_size=1)
        self.offset = nn.Conv2d(64, 2, kernel_size=1)  # 2-channel offset

    def forward(self, x: torch.Tensor) -> Dict[str, torch.Tensor]:
        """Compute lane confidence and offset maps.

        Args:
            x: ``(B, 256, H, W)`` features (the first convolution takes 256
                channels).

        Returns:
            ``{"confidence": (B, 1, H, W) values in [0, 1] (sigmoid),
            "offset": (B, 2, H, W) unbounded values}``.
        """
        feat = self.head(x)
        confidence = torch.sigmoid(self.confidence(feat))
        offset = self.offset(feat)
        return {"confidence": confidence, "offset": offset}

class MotionHead(nn.Module):
    """Motion prediction head: detection boxes -> per-agent future trajectories."""

    def __init__(self, config: Dict):
        super().__init__()
        self.num_agents = config.get("num_agents", 64)
        self.future_frames = config.get("future_frames", 40)
        # Project each 5-value box to the GRU input width; the raw boxes
        # cannot be fed to a GRU that expects 256 input features.
        self.box_embed = nn.Linear(5, 256)
        self.motion_encoder = nn.GRU(
            input_size=256,
            hidden_size=256,
            num_layers=2,
            batch_first=True,
        )
        self.trajectory_head = nn.Linear(256, self.future_frames * 2)

    def forward(
        self,
        bev_features: torch.Tensor,
        detection_boxes: torch.Tensor,
        map_lanes: Optional[torch.Tensor] = None,
    ) -> Dict[str, torch.Tensor]:
        """Predict future trajectories for every detected agent.

        Args:
            bev_features: BEV features; only the batch size is read from it.
            detection_boxes: ``(B, N, 5)`` boxes as ``[x, y, w, h, angle]``.
            map_lanes: map information; accepted but not used by this
                simplified implementation.

        Returns:
            ``{"trajectories": (B, N, future_frames, 2),
            "probabilities": (B, num_agents)}``. The probabilities are a
            softmax over random numbers, i.e. a placeholder, not a prediction.

        Raises:
            ValueError: if ``detection_boxes`` is not ``(B, N, 5)``.
        """
        if detection_boxes.dim() != 3 or detection_boxes.shape[-1] != 5:
            raise ValueError("detection_boxes must have shape (B, N, 5)")

        B = bev_features.shape[0]

        # Simplification: agents are encoded from their box parameters only,
        # with the agent axis used as the GRU sequence axis.
        agent_features = self.box_embed(detection_boxes)  # (B, N, 256)
        encoded, _ = self.motion_encoder(agent_features)  # (B, N, 256)
        trajectories = self.trajectory_head(encoded)
        trajectories = trajectories.view(B, -1, self.future_frames, 2)

        probabilities = torch.softmax(
            torch.randn(B, self.num_agents, device=trajectories.device), dim=-1
        )
        return {"trajectories": trajectories, "probabilities": probabilities}


class PlanningHead(nn.Module):
    """Planning head MLP mapping a 512-d feature vector to an (x, y) output."""

    def __init__(self, config: Dict):
        # ``config`` is accepted for a uniform head interface; no key is read.
        super().__init__()
        self.planner = nn.Sequential(
            nn.Linear(512, 256),
            nn.ReLU(inplace=True),
            nn.Linear(256, 128),
            nn.ReLU(inplace=True),
            nn.Linear(128, 2),  # (x, y)
        )

    # NOTE(review): the source excerpt ends inside this class, so the original
    # forward() is not available and is deliberately not reconstructed here.

AI 자율주행과 엔드투엔드 (End-to-End) 딥러닝 완전 가이드: Tesla FSD, Waymo, 엔드투엔드 솔루션 실전

요약

핵심 포인트

댓글