iPXE 启动期间 iSCSI 读取错误

iPXE 启动期间 iSCSI 读取错误

编辑:于 2019-02-16 进行了大量更新,以包含更多故障排除信息。

我已经使用 iPXE 和 iSCSI 环境多年,但这是我第一次尝试进行 iSCSI 启动,而 iPXE 在与 iSCSI 目标的对话时遇到了问题。

存储服务器

 CentOS Linux release 7.6.1810 (Core)
Linux san1srvp01.********.net 3.10.0-957.5.1.el7.x86_64 #1 SMP Fri Feb 1 14:54:57 UTC 2019 x86_64 x86_64 x86_64 GNU/Linux
zfs-0.7.12-1.el7_6.x86_64

支持block实例

Disk /dev/zpool1/jane: 8422 MB, 8422687232 bytes, 16450561 sectors
Units = sectors of 1 * 512 = 512 bytes
Sector size (logical/physical): 512 bytes / 512 bytes
I/O size (minimum/optimal): 512 bytes / 512 bytes
Disk label type: dos
Disk identifier: 0xa9a554b4

           Device Boot      Start         End      Blocks   Id  System
/dev/zpool1/jane1              63       80324       40131   12  Compaq diagnostics
/dev/zpool1/jane2   *       80325    16434494     8177085    7  HPFS/NTFS/exFAT

支持fileio实例

Disk /zpool1/nas1/Media/c0d0.img: 8422 MB, 8422686720 bytes, 16450560 sectors
Units = sectors of 1 * 512 = 512 bytes
Sector size (logical/physical): 512 bytes / 512 bytes
I/O size (minimum/optimal): 512 bytes / 512 bytes
Disk label type: dos
Disk identifier: 0xa9a554b4

                      Device Boot      Start         End      Blocks   Id  System
/zpool1/nas1/Media/c0d0.img1              63       80324       40131   12  Compaq diagnostics
/zpool1/nas1/Media/c0d0.img2   *       80325    16434494     8177085    7  HPFS/NTFS/exFAT

简化的 iSCSI 目标配置。这是使用 ZFS zvol 的块的示例,我还尝试了 fileio,其行为没有什么不同。

{
  "fabric_modules": [
    {
      "discovery_enable_auth": true,
      "discovery_password": "********************************",
      "discovery_userid": "san1srvp01",
      "name": "iscsi"
    }
  ],
  "storage_objects": [
    {
      "alua_tpgs": [
        {
          "alua_access_state": 0,
          "alua_access_status": 0,
          "alua_access_type": 3,
          "alua_support_active_nonoptimized": 1,
          "alua_support_active_optimized": 1,
          "alua_support_offline": 1,
          "alua_support_standby": 1,
          "alua_support_transitioning": 1,
          "alua_support_unavailable": 1,
          "alua_write_metadata": 0,
          "implicit_trans_secs": 0,
          "name": "default_tg_pt_gp",
          "nonop_delay_msecs": 100,
          "preferred": 0,
          "tg_pt_gp_id": 0,
          "trans_delay_msecs": 0
        }
      ],
      "attributes": {
        "block_size": 512,
        "emulate_3pc": 1,
        "emulate_caw": 1,
        "emulate_dpo": 1,
        "emulate_fua_read": 1,
        "emulate_fua_write": 1,
        "emulate_model_alias": 1,
        "emulate_rest_reord": 0,
        "emulate_tas": 1,
        "emulate_tpu": 0,
        "emulate_tpws": 0,
        "emulate_ua_intlck_ctrl": 0,
        "emulate_write_cache": 0,
        "enforce_pr_isids": 1,
        "force_pr_aptpl": 0,
        "is_nonrot": 1,
        "max_unmap_block_desc_count": 1,
        "max_unmap_lba_count": 262144,
        "max_write_same_len": 65535,
        "optimal_sectors": 32768,
        "pi_prot_format": 0,
        "pi_prot_type": 0,
        "queue_depth": 128,
        "unmap_granularity": 16,
        "unmap_granularity_alignment": 0,
        "unmap_zeroes_data": 0
      },
      "dev": "/dev/zpool1/jane",
      "name": "jane",
      "plugin": "block",
      "readonly": false,
      "write_back": false,
      "wwn": "8688850f-7200-48a0-ad32-0f4f9397a836"
    }
  ],
  "targets": [
    {
      "fabric": "iscsi",
      "tpgs": [
        {
          "attributes": {
            "authentication": 0,
            "cache_dynamic_acls": 0,
            "default_cmdsn_depth": 64,
            "default_erl": 0,
            "demo_mode_discovery": 1,
            "demo_mode_write_protect": 1,
            "fabric_prot_type": 0,
            "generate_node_acls": 0,
            "login_timeout": 15,
            "netif_timeout": 2,
            "prod_mode_write_protect": 0,
            "t10_pi": 0,
            "tpg_enabled_sendtargets": 1
          },
          "enable": true,
          "luns": [
            {
              "alias": "414d07d6b4",
              "alua_tg_pt_gp_name": "default_tg_pt_gp",
              "index": 2,
              "storage_object": "/backstores/block/jane"
            }
          ],
          "node_acls": [
            {
              "attributes": {
                "dataout_timeout": 3,
                "dataout_timeout_retries": 5,
                "default_erl": 0,
                "nopin_response_timeout": 30,
                "nopin_timeout": 15,
                "random_datain_pdu_offsets": 0,
                "random_datain_seq_offsets": 0,
                "random_r2t_offsets": 0
              },
              "chap_mutual_password": "****************",
              "chap_mutual_userid": "san1srvp01",
              "chap_password": "****************",
              "chap_userid": "jane",
              "mapped_luns": [
                {
                  "alias": "c8ce872be3",
                  "index": 2,
                  "tpg_lun": 2,
                  "write_protect": false
                }
              ],
              "node_wwn": "iqn.1999-10.net.********:jane"
            }
          ],
          "parameters": {
            "AuthMethod": "CHAP,None",
            "DataDigest": "CRC32C,None",
            "DataPDUInOrder": "Yes",
            "DataSequenceInOrder": "Yes",
            "DefaultTime2Retain": "20",
            "DefaultTime2Wait": "2",
            "ErrorRecoveryLevel": "0",
            "FirstBurstLength": "65536",
            "HeaderDigest": "CRC32C,None",
            "IFMarkInt": "2048~65535",
            "IFMarker": "No",
            "ImmediateData": "Yes",
            "InitialR2T": "Yes",
            "MaxBurstLength": "262144",
            "MaxConnections": "1",
            "MaxOutstandingR2T": "1",
            "MaxRecvDataSegmentLength": "8192",
            "MaxXmitDataSegmentLength": "262144",
            "OFMarkInt": "2048~65535",
            "OFMarker": "No",
            "TargetAlias": "LIO Target"
          },
          "portals": [
            {
              "ip_address": "192.168.40.1",
              "iser": false,
              "offload": false,
              "port": 3260
            }
          ],
          "tag": 1
        }
      ],
      "wwn": "iqn.1999-10.net.********:san1srvp01"
    }
  ]
}

PXE/iPXE/TFTP/HTTP 服务器

CentOS release 6.10 (Final)
Linux sy1srvp01.********.net 2.6.32-754.10.1.el6.i686 #1 SMP Tue Jan 15 17:33:10 UTC 2019 i686 i686 i386 GNU/Linux
tftp-0.49-8.el6.i686

iPXE 实现将首先按顺序交给与主机名、uuid 或 mac 匹配的脚本。这是此 mac 的单独 iPXE 启动脚本mac-0007e90feaf5.ipxe

set username jane
set password ****************
set reverse-username san1srvp01
set reverse-password ****************
set initiator-iqn iqn.1999-10.net.********:jane
sanboot iscsi:192.168.40.1::::iqn.1999-10.net.********:san1srvp01

发起者

Compaq ML370 (Generation 0)
BIOS P17 (12/18/2002)
Processor 866/133 Mhz with 256k Cache
RAM 1 GB
Intel Boot Agent GE v1.2.22

PXE -> iPXE 链式加载

PXE 2.1 Build 084 (WfM 2.0), RPL V1.25

PX->EB: PXE! at 9CC2:0070, entry point at 9CC2:0106
            UNDI code segment 9CC2:0000, data segment 969B:0000 (602-628kB)
            UNDI device is PCI 00:06.0, type DIX+802.3
            602kB free base memory after PXE unload

 iPXE 1.0.0+ -- Open Source Network Boot Firmware -- http://ipxe.org
 Features: DNS HTTP iSCSI TFTP AoE ELF MBOOT PXE bzImage Menu PXEXT

我使用网络跟踪来跟踪 iPXE sanbootiSCSI 启动过程。从高层次来看,它是:

  1. 登录命令 (CHAP)
  2. 测试单元就绪
  3. 读取容量(10)
  4. 读取(10) <- 失败

首先,Read Capacity(10)返回 LBA 的意外值 63,而不是 16450560。然后它尝试在 LBA 64 处执行 Read(10),结果不出所料地失败了Logical Block Address Out Of Range。测试表明这是由 iPXE 和 LIO 之间的特定交互引起的,但确切原因尚不清楚。

读取容量(10) - 请求

Frame 27: 114 bytes on wire (912 bits), 114 bytes captured (912 bits)
Ethernet II, Src: Intel_0f:ea:f5 (00:07:e9:0f:ea:f5), Dst: SuperMic_6c:a9:82 (00:25:90:6c:a9:82)
Internet Protocol Version 4, Src: 192.168.4.13, Dst: 192.168.40.1
Transmission Control Protocol, Src Port: cifs (3020), Dst Port: iscsi-target (3260), Seq: 773, Ack: 637, Len: 48
iSCSI (SCSI Command)
Flags: 0xc1, F, R, Attr: Simple
SCSI CDB Read Capacity(10)
    [LUN: 0x0000]
    [Command Set:Direct Access Device (0x00) (Using default commandset)]
    [Response in: 29]
    Opcode: Read Capacity(10) (0x25)
    Control: 0x00

读取容量(10) - 响应

Frame 29: 74 bytes on wire (592 bits), 74 bytes captured (592 bits)
Ethernet II, Src: SuperMic_6c:a9:82 (00:25:90:6c:a9:82), Dst: Intel_0f:ea:f5 (00:07:e9:0f:ea:f5)
Internet Protocol Version 4, Src: 192.168.40.1, Dst: 192.168.4.13
Transmission Control Protocol, Src Port: iscsi-target (3260), Dst Port: cifs (3020), Seq: 685, Ack: 821, Len: 8
[2 Reassembled TCP Segments (56 bytes): #28(48), #29(8)]
iSCSI (SCSI Data In)
SCSI Payload (Read Capacity(10) Response Data)
    [LUN: 0x0000]
    [Command Set:Direct Access Device (0x00) (Using default commandset)]
    [SBC Opcode: Read Capacity(10) (0x25)]
    [Request in: 27]
    [Response in: 29]
    LBA: 63 (0 MB)
    Block size in bytes: 512
SCSI Response (Read Capacity(10))
    [LUN: 0x0000]
    [Command Set:Direct Access Device (0x00) (Using default commandset)]
    [SBC Opcode: Read Capacity(10) (0x25)]
    [Request in: 27]
    [Time from request: 0.000252000 seconds]
    [Status: Good (0x00)]

阅读(10) - 请求

Frame 32: 114 bytes on wire (912 bits), 114 bytes captured (912 bits)
Ethernet II, Src: Intel_0f:ea:f5 (00:07:e9:0f:ea:f5), Dst: SuperMic_6c:a9:82 (00:25:90:6c:a9:82)
Internet Protocol Version 4, Src: 192.168.4.13, Dst: 192.168.40.1
Transmission Control Protocol, Src Port: cifs (3020), Dst Port: iscsi-target (3260), Seq: 821, Ack: 693, Len: 48
iSCSI (SCSI Command)
Flags: 0xc1, F, R, Attr: Simple
SCSI CDB Read(10)
    [LUN: 0x0000]
    [Command Set:Direct Access Device (0x00) (Using default commandset)]
    [Response in: 33]
    Opcode: Read(10) (0x28)
    Flags: 0x00
    Logical Block Address (LBA): 64
    ...0 0000 = Group: 0x00
    Transfer Length: 4
    Control: 0x00

阅读(10) - 响应

Frame 33: 214 bytes on wire (1712 bits), 214 bytes captured (1712 bits)
Ethernet II, Src: SuperMic_6c:a9:82 (00:25:90:6c:a9:82), Dst: Intel_0f:ea:f5 (00:07:e9:0f:ea:f5)
Internet Protocol Version 4, Src: 192.168.40.1, Dst: 192.168.4.13
Transmission Control Protocol, Src Port: iscsi-target (3260), Dst Port: cifs (3020), Seq: 693, Ack: 869, Len: 148
iSCSI (SCSI Response)
Flags: 0x80
SCSI: SNS Info
    [LUN: 0x0000]
    .111 0000 = SNS Error Type: Current Error (0x70)
    Valid: 112
    0... .... = Filemark: False
    .0.. .... = EOM: False
    ..0. .... = ILI: False
    .... 0101 = Sense Key: Illegal Request (0x5)
    Sense Info: 0x00000000
    Additional Sense Length: 10
    Command-Specific Information: 00000000
    Additional Sense Code+Qualifier: Logical Block Address Out Of Range (0x2100)
    Field Replaceable Unit Code: 0x00
    0... .... = SKSV: False
    .000 0000 0000 0000 0000 0000 = Sense Key Specific: 0x000000

iPXE 对控制台的响应

Could not open SAN device: Input/output error (http://ipxe.org/1d704039
Could not boot image: Input/output error (http://ipxe.org/1d704039

在 iSCSI 目标上记录的消息

Feb 13 09:17:41 san1srvp01 kernel: cmd exceeds last lba 64 (lba 64, sectors 4)

测试和故障排除

  • 尝试使用此 iSCSI LUN 启动 VM 时观察到相同的行为,排除物理机及其网卡。
  • 使用本机 Linux 启动器挂载时,iSCSI 设备行为正常,并且该设备最初通过 iSCSI 挂载使用 dd 将映像文件复制到该设备。
  • 我创建了一个自定义的 iPXE 版本,强制Read Capacity(16)Read(16),但无济于事。
  • 我发现了一个有记录的类似行为实例,该行为被确定为由阶段期间提供(或未提供)的操作参数导致的Login。作为回应,我创建了一个自定义版本,其参数和参数值与使用本机 Linux 启动器进行工作安装期间观察到的参数和参数值相同,但无济于事。
  • 我尝试将 fileio 备用存储映像从 ZFS 移动到 xfs 卷,但无济于事。
  • 我尝试过使用块设备初始化为零的 iPXE sanboot,但无济于事。这表明问题与分区或块设备内容无关。

问题

  • 有人能证明有与此相当的工作设置吗?
  • 有人知道这个设置到底存在什么问题吗?
  • 有谁知道是什么原因导致 LIO 出现这种行为?
  • 最难的问题是,有人知道如何解决这个问题吗?

—短暂性脑缺血发作

答案1

该卷是存储服务器上的 LUN 2,iPXE 需要明确处理 LUN 2。存储服务器甚至没有 LUN 0,因此不清楚为什么成功Test Unit Ready了。有点尴尬,因为这是件如此简单的事情。

相关内容