Merge branch '3leaf'

Michael Brown
2007-10-29 17:21:58 +00:00
10 changed files with 8058 additions and 1 deletion

File diff suppressed because it is too large

File diff suppressed because it is too large

arbel.h (new file, 461 lines)

@@ -0,0 +1,461 @@
#ifndef _ARBEL_H
#define _ARBEL_H
/** @file
*
* Mellanox Arbel Infiniband HCA driver
*
*/
#include <stdint.h>
#include <gpxe/uaccess.h>
#include "mlx_bitops.h"
#include "MT25218_PRM.h"
/*
* Hardware constants
*
*/
/* PCI BARs */
#define ARBEL_PCI_CONFIG_BAR PCI_BASE_ADDRESS_0
#define ARBEL_PCI_CONFIG_BAR_SIZE 0x100000
#define ARBEL_PCI_UAR_BAR PCI_BASE_ADDRESS_2
#define ARBEL_PCI_UAR_IDX 1
#define ARBEL_PCI_UAR_SIZE 0x1000
/* UAR context table (UCE) resource types */
#define ARBEL_UAR_RES_NONE 0x00
#define ARBEL_UAR_RES_CQ_CI 0x01
#define ARBEL_UAR_RES_CQ_ARM 0x02
#define ARBEL_UAR_RES_SQ 0x03
#define ARBEL_UAR_RES_RQ 0x04
#define ARBEL_UAR_RES_GROUP_SEP 0x07
/* Work queue entry and completion queue entry opcodes */
#define ARBEL_OPCODE_SEND 0x0a
#define ARBEL_OPCODE_RECV_ERROR 0xfe
#define ARBEL_OPCODE_SEND_ERROR 0xff
/* HCA command register opcodes */
#define ARBEL_HCR_QUERY_DEV_LIM 0x0003
#define ARBEL_HCR_QUERY_FW 0x0004
#define ARBEL_HCR_INIT_HCA 0x0007
#define ARBEL_HCR_CLOSE_HCA 0x0008
#define ARBEL_HCR_INIT_IB 0x0009
#define ARBEL_HCR_CLOSE_IB 0x000a
#define ARBEL_HCR_SW2HW_MPT 0x000d
#define ARBEL_HCR_MAP_EQ 0x0012
#define ARBEL_HCR_SW2HW_EQ 0x0013
#define ARBEL_HCR_HW2SW_EQ 0x0014
#define ARBEL_HCR_SW2HW_CQ 0x0016
#define ARBEL_HCR_HW2SW_CQ 0x0017
#define ARBEL_HCR_RST2INIT_QPEE 0x0019
#define ARBEL_HCR_INIT2RTR_QPEE 0x001a
#define ARBEL_HCR_RTR2RTS_QPEE 0x001b
#define ARBEL_HCR_2RST_QPEE 0x0021
#define ARBEL_HCR_MAD_IFC 0x0024
#define ARBEL_HCR_READ_MGM 0x0025
#define ARBEL_HCR_WRITE_MGM 0x0026
#define ARBEL_HCR_MGID_HASH 0x0027
#define ARBEL_HCR_RUN_FW 0x0ff6
#define ARBEL_HCR_DISABLE_LAM 0x0ff7
#define ARBEL_HCR_ENABLE_LAM 0x0ff8
#define ARBEL_HCR_UNMAP_ICM 0x0ff9
#define ARBEL_HCR_MAP_ICM 0x0ffa
#define ARBEL_HCR_UNMAP_ICM_AUX 0x0ffb
#define ARBEL_HCR_MAP_ICM_AUX 0x0ffc
#define ARBEL_HCR_SET_ICM_SIZE 0x0ffd
#define ARBEL_HCR_UNMAP_FA 0x0ffe
#define ARBEL_HCR_MAP_FA 0x0fff
/* Service types */
#define ARBEL_ST_UD 0x03
/* MTUs */
#define ARBEL_MTU_2048 0x04
#define ARBEL_NO_EQ 64
#define ARBEL_INVALID_LKEY 0x00000100UL
#define ARBEL_PAGE_SIZE 4096
#define ARBEL_DB_POST_SND_OFFSET 0x10
/*
* Datatypes that seem to be missing from the autogenerated documentation
*
*/
struct arbelprm_mgm_hash_st {
pseudo_bit_t reserved0[0x00020];
/* -------------- */
pseudo_bit_t hash[0x00010];
pseudo_bit_t reserved1[0x00010];
} __attribute__ (( packed ));
struct arbelprm_scalar_parameter_st {
pseudo_bit_t reserved0[0x00020];
/* -------------- */
pseudo_bit_t value[0x00020];
} __attribute__ (( packed ));
/*
* Wrapper structures for hardware datatypes
*
*/
struct MLX_DECLARE_STRUCT ( arbelprm_access_lam );
struct MLX_DECLARE_STRUCT ( arbelprm_completion_queue_context );
struct MLX_DECLARE_STRUCT ( arbelprm_completion_queue_entry );
struct MLX_DECLARE_STRUCT ( arbelprm_completion_with_error );
struct MLX_DECLARE_STRUCT ( arbelprm_cq_arm_db_record );
struct MLX_DECLARE_STRUCT ( arbelprm_cq_ci_db_record );
struct MLX_DECLARE_STRUCT ( arbelprm_eqc );
struct MLX_DECLARE_STRUCT ( arbelprm_hca_command_register );
struct MLX_DECLARE_STRUCT ( arbelprm_init_hca );
struct MLX_DECLARE_STRUCT ( arbelprm_init_ib );
struct MLX_DECLARE_STRUCT ( arbelprm_mad_ifc );
struct MLX_DECLARE_STRUCT ( arbelprm_mgm_entry );
struct MLX_DECLARE_STRUCT ( arbelprm_mgm_hash );
struct MLX_DECLARE_STRUCT ( arbelprm_mpt );
struct MLX_DECLARE_STRUCT ( arbelprm_qp_db_record );
struct MLX_DECLARE_STRUCT ( arbelprm_qp_ee_state_transitions );
struct MLX_DECLARE_STRUCT ( arbelprm_query_dev_lim );
struct MLX_DECLARE_STRUCT ( arbelprm_query_fw );
struct MLX_DECLARE_STRUCT ( arbelprm_queue_pair_ee_context_entry );
struct MLX_DECLARE_STRUCT ( arbelprm_recv_wqe_segment_next );
struct MLX_DECLARE_STRUCT ( arbelprm_scalar_parameter );
struct MLX_DECLARE_STRUCT ( arbelprm_send_doorbell );
struct MLX_DECLARE_STRUCT ( arbelprm_ud_address_vector );
struct MLX_DECLARE_STRUCT ( arbelprm_virtual_physical_mapping );
struct MLX_DECLARE_STRUCT ( arbelprm_wqe_segment_ctrl_send );
struct MLX_DECLARE_STRUCT ( arbelprm_wqe_segment_data_ptr );
struct MLX_DECLARE_STRUCT ( arbelprm_wqe_segment_next );
struct MLX_DECLARE_STRUCT ( arbelprm_wqe_segment_ud );
/*
* Composite hardware datatypes
*
*/
#define ARBEL_MAX_GATHER 1
struct arbelprm_ud_send_wqe {
struct arbelprm_wqe_segment_next next;
struct arbelprm_wqe_segment_ctrl_send ctrl;
struct arbelprm_wqe_segment_ud ud;
struct arbelprm_wqe_segment_data_ptr data[ARBEL_MAX_GATHER];
} __attribute__ (( packed ));
#define ARBEL_MAX_SCATTER 1
struct arbelprm_recv_wqe {
/* The autogenerated header is inconsistent between send and
* receive WQEs. The "ctrl" structure for receive WQEs is
* defined to include the "next" structure. Since the "ctrl"
* part of the "ctrl" structure contains only "reserved, must
* be zero" bits, we ignore its definition and provide
* something more usable.
*/
struct arbelprm_recv_wqe_segment_next next;
uint32_t ctrl[2]; /* All "reserved, must be zero" */
struct arbelprm_wqe_segment_data_ptr data[ARBEL_MAX_SCATTER];
} __attribute__ (( packed ));
union arbelprm_completion_entry {
struct arbelprm_completion_queue_entry normal;
struct arbelprm_completion_with_error error;
} __attribute__ (( packed ));
union arbelprm_doorbell_record {
struct arbelprm_cq_arm_db_record cq_arm;
struct arbelprm_cq_ci_db_record cq_ci;
struct arbelprm_qp_db_record qp;
} __attribute__ (( packed ));
union arbelprm_doorbell_register {
struct arbelprm_send_doorbell send;
uint32_t dword[2];
} __attribute__ (( packed ));
union arbelprm_mad {
struct arbelprm_mad_ifc ifc;
union ib_mad mad;
} __attribute__ (( packed ));
/*
* gPXE-specific definitions
*
*/
/** Arbel device limits */
struct arbel_dev_limits {
/** Number of reserved QPs */
unsigned int reserved_qps;
/** QP context entry size */
size_t qpc_entry_size;
/** Extended QP context entry size */
size_t eqpc_entry_size;
/** Number of reserved SRQs */
unsigned int reserved_srqs;
/** SRQ context entry size */
size_t srqc_entry_size;
/** Number of reserved EEs */
unsigned int reserved_ees;
/** EE context entry size */
size_t eec_entry_size;
/** Extended EE context entry size */
size_t eeec_entry_size;
/** Number of reserved CQs */
unsigned int reserved_cqs;
/** CQ context entry size */
size_t cqc_entry_size;
/** Number of reserved MTTs */
unsigned int reserved_mtts;
/** MTT entry size */
size_t mtt_entry_size;
/** Number of reserved MRWs */
unsigned int reserved_mrws;
/** MPT entry size */
size_t mpt_entry_size;
/** Number of reserved RDBs */
unsigned int reserved_rdbs;
/** EQ context entry size */
size_t eqc_entry_size;
/** Number of reserved UARs */
unsigned int reserved_uars;
};
/** Alignment of Arbel send work queue entries */
#define ARBEL_SEND_WQE_ALIGN 128
/** An Arbel send work queue entry */
union arbel_send_wqe {
struct arbelprm_ud_send_wqe ud;
uint8_t force_align[ARBEL_SEND_WQE_ALIGN];
} __attribute__ (( packed ));
/** An Arbel send work queue */
struct arbel_send_work_queue {
/** Doorbell record number */
unsigned int doorbell_idx;
/** Work queue entries */
union arbel_send_wqe *wqe;
/** Size of work queue */
size_t wqe_size;
};
/** Alignment of Arbel receive work queue entries */
#define ARBEL_RECV_WQE_ALIGN 64
/** An Arbel receive work queue entry */
union arbel_recv_wqe {
struct arbelprm_recv_wqe recv;
uint8_t force_align[ARBEL_RECV_WQE_ALIGN];
} __attribute__ (( packed ));
/** An Arbel receive work queue */
struct arbel_recv_work_queue {
/** Doorbell record number */
unsigned int doorbell_idx;
/** Work queue entries */
union arbel_recv_wqe *wqe;
/** Size of work queue */
size_t wqe_size;
};
/** Maximum number of allocatable queue pairs
*
* This is a policy decision, not a device limit.
*/
#define ARBEL_MAX_QPS 8
/** Base queue pair number */
#define ARBEL_QPN_BASE 0x550000
/** An Arbel queue pair */
struct arbel_queue_pair {
/** Send work queue */
struct arbel_send_work_queue send;
/** Receive work queue */
struct arbel_recv_work_queue recv;
};
/** Maximum number of allocatable completion queues
*
* This is a policy decision, not a device limit.
*/
#define ARBEL_MAX_CQS 8
/** An Arbel completion queue */
struct arbel_completion_queue {
/** Consumer counter doorbell record number */
unsigned int ci_doorbell_idx;
/** Arm queue doorbell record number */
unsigned int arm_doorbell_idx;
/** Completion queue entries */
union arbelprm_completion_entry *cqe;
/** Size of completion queue */
size_t cqe_size;
};
/** An Arbel resource bitmask */
typedef uint32_t arbel_bitmask_t;
/** Size of an Arbel resource bitmask */
#define ARBEL_BITMASK_SIZE(max_entries) \
( ( (max_entries) + ( 8 * sizeof ( arbel_bitmask_t ) ) - 1 ) / \
( 8 * sizeof ( arbel_bitmask_t ) ) )
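/* For example (illustrative note, not part of the original header):
* ARBEL_BITMASK_SIZE ( ARBEL_MAX_CQS ) with ARBEL_MAX_CQS of 8
* evaluates to ( ( 8 + 31 ) / 32 ) = 1, i.e. a single 32-bit mask word.
*/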
/** An Arbel device */
struct arbel {
/** PCI configuration registers */
void *config;
/** PCI User Access Region */
void *uar;
/** Command input mailbox */
void *mailbox_in;
/** Command output mailbox */
void *mailbox_out;
/** Firmware area in external memory */
userptr_t firmware_area;
/** ICM size */
size_t icm_len;
/** ICM AUX size */
size_t icm_aux_len;
/** ICM area */
userptr_t icm;
/** Doorbell records */
union arbelprm_doorbell_record *db_rec;
/** Reserved LKey
*
* Used to get unrestricted memory access.
*/
unsigned long reserved_lkey;
/** Completion queue in-use bitmask */
arbel_bitmask_t cq_inuse[ ARBEL_BITMASK_SIZE ( ARBEL_MAX_CQS ) ];
/** Queue pair in-use bitmask */
arbel_bitmask_t qp_inuse[ ARBEL_BITMASK_SIZE ( ARBEL_MAX_QPS ) ];
/** Device limits */
struct arbel_dev_limits limits;
};
/** Global protection domain */
#define ARBEL_GLOBAL_PD 0x123456
/** Memory key prefix */
#define ARBEL_MKEY_PREFIX 0x77000000UL
/*
* HCA commands
*
*/
#define ARBEL_HCR_BASE 0x80680
#define ARBEL_HCR_REG(x) ( ARBEL_HCR_BASE + 4 * (x) )
#define ARBEL_HCR_MAX_WAIT_MS 2000
#define ARBEL_MBOX_ALIGN 4096
#define ARBEL_MBOX_SIZE 512
/* HCA command is split into
*
* bits 11:0 Opcode
* bit 12 Input uses mailbox
* bit 13 Output uses mailbox
* bits 22:14 Input parameter length (in dwords)
* bits 31:23 Output parameter length (in dwords)
*
* Encoding the information in this way allows us to cut out several
* parameters to the arbel_command() call.
*/
#define ARBEL_HCR_IN_MBOX 0x00001000UL
#define ARBEL_HCR_OUT_MBOX 0x00002000UL
#define ARBEL_HCR_OPCODE( _command ) ( (_command) & 0xfff )
#define ARBEL_HCR_IN_LEN( _command ) ( ( (_command) >> 12 ) & 0x7fc )
#define ARBEL_HCR_OUT_LEN( _command ) ( ( (_command) >> 21 ) & 0x7fc )
/** Build HCR command from component parts */
#define ARBEL_HCR_INOUT_CMD( _opcode, _in_mbox, _in_len, \
_out_mbox, _out_len ) \
( (_opcode) | \
( (_in_mbox) ? ARBEL_HCR_IN_MBOX : 0 ) | \
( ( (_in_len) / 4 ) << 14 ) | \
( (_out_mbox) ? ARBEL_HCR_OUT_MBOX : 0 ) | \
( ( (_out_len) / 4 ) << 23 ) )
#define ARBEL_HCR_IN_CMD( _opcode, _in_mbox, _in_len ) \
ARBEL_HCR_INOUT_CMD ( _opcode, _in_mbox, _in_len, 0, 0 )
#define ARBEL_HCR_OUT_CMD( _opcode, _out_mbox, _out_len ) \
ARBEL_HCR_INOUT_CMD ( _opcode, 0, 0, _out_mbox, _out_len )
#define ARBEL_HCR_VOID_CMD( _opcode ) \
ARBEL_HCR_INOUT_CMD ( _opcode, 0, 0, 0, 0 )
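/* Worked example of the encoding described above (illustrative only,
* not used by the driver): a command with opcode ARBEL_HCR_QUERY_FW
* (0x0004) and a 256-byte output mailbox encodes as
*
*   ARBEL_HCR_OUT_CMD ( 0x0004, 1, 256 )
*     = 0x0004 | ARBEL_HCR_OUT_MBOX | ( ( 256 / 4 ) << 23 )
*     = 0x20002004
*
* from which ARBEL_HCR_OPCODE() recovers 0x004 and ARBEL_HCR_OUT_LEN()
* recovers 256; the length macros shift by two bits fewer than the
* field position and mask with 0x7fc, so they yield the byte length
* directly rather than the dword count.
*/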
/*
* Doorbell record allocation
*
* The doorbell record map looks like:
*
* ARBEL_MAX_CQS * Arm completion queue doorbell
* ARBEL_MAX_QPS * Send work request doorbell
* Group separator
* ...(empty space)...
* ARBEL_MAX_QPS * Receive work request doorbell
* ARBEL_MAX_CQS * Completion queue consumer counter update doorbell
*/
#define ARBEL_MAX_DOORBELL_RECORDS 512
#define ARBEL_GROUP_SEPARATOR_DOORBELL ( ARBEL_MAX_CQS + ARBEL_MAX_QPS )
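/* Worked example (illustrative): with ARBEL_MAX_CQS and ARBEL_MAX_QPS
* both 8, the arm doorbells occupy indices 0-7, the send doorbells
* 8-15 and the group separator 16, while the receive doorbells run
* from 503 down to 496 and the consumer counter doorbells from 511
* down to 504; indices 17-495 remain unused.
*/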
/**
* Get arm completion queue doorbell index
*
* @v cqn_offset Completion queue number offset
* @ret doorbell_idx Doorbell index
*/
static inline unsigned int
arbel_cq_arm_doorbell_idx ( unsigned int cqn_offset ) {
return cqn_offset;
}
/**
* Get send work request doorbell index
*
* @v qpn_offset Queue pair number offset
* @ret doorbell_idx Doorbell index
*/
static inline unsigned int
arbel_send_doorbell_idx ( unsigned int qpn_offset ) {
return ( ARBEL_MAX_CQS + qpn_offset );
}
/**
* Get receive work request doorbell index
*
* @v qpn_offset Queue pair number offset
* @ret doorbell_idx Doorbell index
*/
static inline unsigned int
arbel_recv_doorbell_idx ( unsigned int qpn_offset ) {
return ( ARBEL_MAX_DOORBELL_RECORDS - ARBEL_MAX_CQS - qpn_offset - 1 );
}
/**
* Get completion queue consumer counter doorbell index
*
* @v cqn_offset Completion queue number offset
* @ret doorbell_idx Doorbell index
*/
static inline unsigned int
arbel_cq_ci_doorbell_idx ( unsigned int cqn_offset ) {
return ( ARBEL_MAX_DOORBELL_RECORDS - cqn_offset - 1 );
}
#endif /* _ARBEL_H */

mlx_bitops.h (new file, 209 lines)

@@ -0,0 +1,209 @@
#ifndef _MLX_BITOPS_H
#define _MLX_BITOPS_H
/*
* Copyright (C) 2007 Michael Brown <mbrown@fensystems.co.uk>.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License as
* published by the Free Software Foundation; either version 2 of the
* License, or any later version.
*
* This program is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
/**
* @file
*
* Mellanox bit operations
*
*/
/* Datatype used to represent a bit in the Mellanox autogenerated headers */
typedef unsigned char pseudo_bit_t;
/**
* Wrapper structure for pseudo_bit_t structures
*
* This structure provides a wrapper around the autogenerated
* pseudo_bit_t structures. It has the correct size, and also
* encapsulates type information about the underlying pseudo_bit_t
* structure, which allows the MLX_FILL etc. macros to work without
* requiring explicit type information.
*/
#define MLX_DECLARE_STRUCT( _structure ) \
_structure { \
union { \
uint8_t bytes[ sizeof ( struct _structure ## _st ) / 8 ]; \
uint32_t dwords[ sizeof ( struct _structure ## _st ) / 32 ]; \
struct _structure ## _st *dummy[0]; \
} u; \
}
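/* Illustrative expansion (assuming struct arbelprm_qp_db_record_st is
* 64 pseudo-bits wide): MLX_DECLARE_STRUCT ( arbelprm_qp_db_record )
* declares
*
*   struct arbelprm_qp_db_record {
*       union {
*           uint8_t bytes[8];
*           uint32_t dwords[2];
*           struct arbelprm_qp_db_record_st *dummy[0];
*       } u;
*   };
*
* The zero-length dummy[] array occupies no space but carries the
* pseudo_bit_t type information used by MLX_PSEUDO_STRUCT().
*/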
/** Get pseudo_bit_t structure type from wrapper structure pointer */
#define MLX_PSEUDO_STRUCT( _ptr ) \
typeof ( *((_ptr)->u.dummy[0]) )
/** Bit offset of a field within a pseudo_bit_t structure */
#define MLX_BIT_OFFSET( _structure_st, _field ) \
offsetof ( _structure_st, _field )
/** Dword offset of a field within a pseudo_bit_t structure */
#define MLX_DWORD_OFFSET( _structure_st, _field ) \
( MLX_BIT_OFFSET ( _structure_st, _field ) / 32 )
/** Dword bit offset of a field within a pseudo_bit_t structure
*
* Yes, using mod-32 would work, but would lose the check for the
* error of specifying a mismatched field name and dword index.
*/
#define MLX_DWORD_BIT_OFFSET( _structure_st, _index, _field ) \
( MLX_BIT_OFFSET ( _structure_st, _field ) - ( 32 * (_index) ) )
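/* For example (illustrative): a field starting at bit 70 of its
* structure lies in dword 2 at bit 6, so
* MLX_DWORD_BIT_OFFSET ( st, 2, field ) = 70 - 64 = 6. A mismatched
* index of 1 would instead yield 38, provoking a constant
* shift-out-of-range warning rather than the silent wrap that mod-32
* would give.
*/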
/** Bit width of a field within a pseudo_bit_t structure */
#define MLX_BIT_WIDTH( _structure_st, _field ) \
sizeof ( ( ( _structure_st * ) NULL )->_field )
/** Bit mask for a field within a pseudo_bit_t structure */
#define MLX_BIT_MASK( _structure_st, _field ) \
( ( ~( ( uint32_t ) 0 ) ) >> \
( 32 - MLX_BIT_WIDTH ( _structure_st, _field ) ) )
/*
* Assemble native-endian dword from named fields and values
*
*/
#define MLX_ASSEMBLE_1( _structure_st, _index, _field, _value ) \
( (_value) << MLX_DWORD_BIT_OFFSET ( _structure_st, _index, _field ) )
#define MLX_ASSEMBLE_2( _structure_st, _index, _field, _value, ... ) \
( MLX_ASSEMBLE_1 ( _structure_st, _index, _field, _value ) | \
MLX_ASSEMBLE_1 ( _structure_st, _index, __VA_ARGS__ ) )
#define MLX_ASSEMBLE_3( _structure_st, _index, _field, _value, ... ) \
( MLX_ASSEMBLE_1 ( _structure_st, _index, _field, _value ) | \
MLX_ASSEMBLE_2 ( _structure_st, _index, __VA_ARGS__ ) )
#define MLX_ASSEMBLE_4( _structure_st, _index, _field, _value, ... ) \
( MLX_ASSEMBLE_1 ( _structure_st, _index, _field, _value ) | \
MLX_ASSEMBLE_3 ( _structure_st, _index, __VA_ARGS__ ) )
#define MLX_ASSEMBLE_5( _structure_st, _index, _field, _value, ... ) \
( MLX_ASSEMBLE_1 ( _structure_st, _index, _field, _value ) | \
MLX_ASSEMBLE_4 ( _structure_st, _index, __VA_ARGS__ ) )
#define MLX_ASSEMBLE_6( _structure_st, _index, _field, _value, ... ) \
( MLX_ASSEMBLE_1 ( _structure_st, _index, _field, _value ) | \
MLX_ASSEMBLE_5 ( _structure_st, _index, __VA_ARGS__ ) )
/*
* Build native-endian (positive) dword bitmasks from named fields
*
*/
#define MLX_MASK_1( _structure_st, _index, _field ) \
( MLX_BIT_MASK ( _structure_st, _field ) << \
MLX_DWORD_BIT_OFFSET ( _structure_st, _index, _field ) )
#define MLX_MASK_2( _structure_st, _index, _field, ... ) \
( MLX_MASK_1 ( _structure_st, _index, _field ) | \
MLX_MASK_1 ( _structure_st, _index, __VA_ARGS__ ) )
#define MLX_MASK_3( _structure_st, _index, _field, ... ) \
( MLX_MASK_1 ( _structure_st, _index, _field ) | \
MLX_MASK_2 ( _structure_st, _index, __VA_ARGS__ ) )
#define MLX_MASK_4( _structure_st, _index, _field, ... ) \
( MLX_MASK_1 ( _structure_st, _index, _field ) | \
MLX_MASK_3 ( _structure_st, _index, __VA_ARGS__ ) )
#define MLX_MASK_5( _structure_st, _index, _field, ... ) \
( MLX_MASK_1 ( _structure_st, _index, _field ) | \
MLX_MASK_4 ( _structure_st, _index, __VA_ARGS__ ) )
#define MLX_MASK_6( _structure_st, _index, _field, ... ) \
( MLX_MASK_1 ( _structure_st, _index, _field ) | \
MLX_MASK_5 ( _structure_st, _index, __VA_ARGS__ ) )
/*
* Populate big-endian dwords from named fields and values
*
*/
#define MLX_FILL( _ptr, _index, _assembled ) \
do { \
uint32_t *__ptr = &(_ptr)->u.dwords[(_index)]; \
uint32_t __assembled = (_assembled); \
*__ptr = cpu_to_be32 ( __assembled ); \
} while ( 0 )
#define MLX_FILL_1( _ptr, _index, ... ) \
MLX_FILL ( _ptr, _index, MLX_ASSEMBLE_1 ( MLX_PSEUDO_STRUCT ( _ptr ),\
_index, __VA_ARGS__ ) )
#define MLX_FILL_2( _ptr, _index, ... ) \
MLX_FILL ( _ptr, _index, MLX_ASSEMBLE_2 ( MLX_PSEUDO_STRUCT ( _ptr ),\
_index, __VA_ARGS__ ) )
#define MLX_FILL_3( _ptr, _index, ... ) \
MLX_FILL ( _ptr, _index, MLX_ASSEMBLE_3 ( MLX_PSEUDO_STRUCT ( _ptr ),\
_index, __VA_ARGS__ ) )
#define MLX_FILL_4( _ptr, _index, ... ) \
MLX_FILL ( _ptr, _index, MLX_ASSEMBLE_4 ( MLX_PSEUDO_STRUCT ( _ptr ),\
_index, __VA_ARGS__ ) )
#define MLX_FILL_5( _ptr, _index, ... ) \
MLX_FILL ( _ptr, _index, MLX_ASSEMBLE_5 ( MLX_PSEUDO_STRUCT ( _ptr ),\
_index, __VA_ARGS__ ) )
#define MLX_FILL_6( _ptr, _index, ... ) \
MLX_FILL ( _ptr, _index, MLX_ASSEMBLE_6 ( MLX_PSEUDO_STRUCT ( _ptr ),\
_index, __VA_ARGS__ ) )
/*
* Modify big-endian dword using named field and value
*
*/
#define MLX_SET( _ptr, _field, _value ) \
do { \
unsigned int __index = \
MLX_DWORD_OFFSET ( MLX_PSEUDO_STRUCT ( _ptr ), _field ); \
uint32_t *__ptr = &(_ptr)->u.dwords[__index]; \
uint32_t __value = be32_to_cpu ( *__ptr ); \
__value &= ~( MLX_MASK_1 ( MLX_PSEUDO_STRUCT ( _ptr ), \
__index, _field ) ); \
__value |= MLX_ASSEMBLE_1 ( MLX_PSEUDO_STRUCT ( _ptr ), \
__index, _field, _value ); \
*__ptr = cpu_to_be32 ( __value ); \
} while ( 0 )
/*
* Extract value of named field
*
*/
#define MLX_GET( _ptr, _field ) \
( { \
unsigned int __index = \
MLX_DWORD_OFFSET ( MLX_PSEUDO_STRUCT ( _ptr ), _field ); \
uint32_t *__ptr = &(_ptr)->u.dwords[__index]; \
uint32_t __value = be32_to_cpu ( *__ptr ); \
__value >>= \
MLX_DWORD_BIT_OFFSET ( MLX_PSEUDO_STRUCT ( _ptr ), \
__index, _field ); \
__value &= \
MLX_BIT_MASK ( MLX_PSEUDO_STRUCT ( _ptr ), _field ); \
__value; \
} )
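/* Usage sketch (illustrative; the "counter" field name is an
* assumption, not taken from the PRM header):
*
*   struct arbelprm_qp_db_record qp_db;
*
*   MLX_FILL_1 ( &qp_db, 0, counter, 0 );
*   MLX_SET ( &qp_db, counter, next_idx );
*   idx = MLX_GET ( &qp_db, counter );
*
* MLX_FILL_1() writes a whole dword, MLX_SET() read-modify-writes one
* field within its dword, and MLX_GET() extracts a field; all dword
* indices and bit offsets are resolved at compile time.
*/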
#endif /* _MLX_BITOPS_H */

src/drivers/net/ipoib.c (new file, 930 lines)

@@ -0,0 +1,930 @@
/*
* Copyright (C) 2007 Michael Brown <mbrown@fensystems.co.uk>.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License as
* published by the Free Software Foundation; either version 2 of the
* License, or any later version.
*
* This program is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>
#include <string.h>
#include <byteswap.h>
#include <errno.h>
#include "timer.h"
#include <gpxe/if_arp.h>
#include <gpxe/iobuf.h>
#include <gpxe/netdevice.h>
#include <gpxe/infiniband.h>
#include <gpxe/ipoib.h>
/** @file
*
* IP over Infiniband
*/
/** IPoIB MTU */
#define IPOIB_MTU 2048
/** Number of IPoIB data send work queue entries */
#define IPOIB_DATA_NUM_SEND_WQES 2
/** Number of IPoIB data receive work queue entries */
#define IPOIB_DATA_NUM_RECV_WQES 4
/** Number of IPoIB data completion entries */
#define IPOIB_DATA_NUM_CQES 8
/** Number of IPoIB metadata send work queue entries */
#define IPOIB_META_NUM_SEND_WQES 2
/** Number of IPoIB metadata receive work queue entries */
#define IPOIB_META_NUM_RECV_WQES 2
/** Number of IPoIB metadata completion entries */
#define IPOIB_META_NUM_CQES 8
/** An IPoIB queue set */
struct ipoib_queue_set {
/** Completion queue */
struct ib_completion_queue *cq;
/** Queue pair */
struct ib_queue_pair *qp;
/** Receive work queue fill level */
unsigned int recv_fill;
/** Receive work queue maximum fill level */
unsigned int recv_max_fill;
};
/** An IPoIB device */
struct ipoib_device {
/** Network device */
struct net_device *netdev;
/** Underlying Infiniband device */
struct ib_device *ibdev;
/** Data queue set */
struct ipoib_queue_set data;
/** Metadata queue set */
struct ipoib_queue_set meta;
/** Broadcast GID */
struct ib_gid broadcast_gid;
/** Broadcast LID */
unsigned int broadcast_lid;
/** Joined to broadcast group */
int broadcast_joined;
/** Data queue key */
unsigned long data_qkey;
};
/**
* IPoIB path cache entry
*
* This serves a similar role to the ARP cache for Ethernet. (ARP
* *is* used on IPoIB; we have two caches to maintain.)
*/
struct ipoib_cached_path {
/** Destination GID */
struct ib_gid gid;
/** Destination LID */
unsigned int dlid;
/** Service level */
unsigned int sl;
/** Rate */
unsigned int rate;
};
/** Number of IPoIB path cache entries */
#define IPOIB_NUM_CACHED_PATHS 2
/** IPoIB path cache */
static struct ipoib_cached_path ipoib_path_cache[IPOIB_NUM_CACHED_PATHS];
/** Oldest IPoIB path cache entry index */
static unsigned int ipoib_path_cache_idx = 0;
/** TID half used to identify get path record replies */
#define IPOIB_TID_GET_PATH_REC 0x11111111UL
/** TID half used to identify multicast member record replies */
#define IPOIB_TID_MC_MEMBER_REC 0x22222222UL
/** IPoIB metadata TID */
static uint32_t ipoib_meta_tid = 0;
/** IPv4 broadcast GID */
static const struct ib_gid ipv4_broadcast_gid = {
{ { 0xff, 0x12, 0x40, 0x1b, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff } }
};
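/* Note (illustrative): the partition key is patched into bytes 4-5 of
* this template at probe time (see ipoib_probe()), giving the
* conventional IPv4 broadcast MGID of the form
* ff12:401b:<pkey>:0000:0000:0000:ffff:ffff.
*/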
/** Maximum time we will wait for the broadcast join to succeed */
#define IPOIB_JOIN_MAX_DELAY_MS 1000
/****************************************************************************
*
* IPoIB link layer
*
****************************************************************************
*/
/** Broadcast QPN used in IPoIB MAC addresses
*
* This is a guaranteed invalid real QPN, since InfiniBand queue pair
* numbers are only 24 bits wide.
*/
#define IPOIB_BROADCAST_QPN 0xffffffffUL
/** Broadcast IPoIB address */
static struct ipoib_mac ipoib_broadcast = {
.qpn = ntohl ( IPOIB_BROADCAST_QPN ),
};
/**
* Transmit IPoIB packet
*
* @v iobuf I/O buffer
* @v netdev Network device
* @v net_protocol Network-layer protocol
* @v ll_dest Link-layer destination address
*
* Prepends the IPoIB link-layer header and transmits the packet.
*/
static int ipoib_tx ( struct io_buffer *iobuf, struct net_device *netdev,
struct net_protocol *net_protocol,
const void *ll_dest ) {
struct ipoib_hdr *ipoib_hdr =
iob_push ( iobuf, sizeof ( *ipoib_hdr ) );
/* Build IPoIB header */
memcpy ( &ipoib_hdr->pseudo.peer, ll_dest,
sizeof ( ipoib_hdr->pseudo.peer ) );
ipoib_hdr->real.proto = net_protocol->net_proto;
ipoib_hdr->real.reserved = 0;
/* Hand off to network device */
return netdev_tx ( netdev, iobuf );
}
/**
* Process received IPoIB packet
*
* @v iobuf I/O buffer
* @v netdev Network device
*
* Strips off the IPoIB link-layer header and passes up to the
* network-layer protocol.
*/
static int ipoib_rx ( struct io_buffer *iobuf, struct net_device *netdev ) {
struct ipoib_hdr *ipoib_hdr = iobuf->data;
/* Sanity check */
if ( iob_len ( iobuf ) < sizeof ( *ipoib_hdr ) ) {
DBG ( "IPoIB packet too short for link-layer header\n" );
DBG_HD ( iobuf->data, iob_len ( iobuf ) );
free_iob ( iobuf );
return -EINVAL;
}
/* Strip off IPoIB header */
iob_pull ( iobuf, sizeof ( *ipoib_hdr ) );
/* Hand off to network-layer protocol */
return net_rx ( iobuf, netdev, ipoib_hdr->real.proto,
&ipoib_hdr->pseudo.peer );
}
/**
* Transcribe IPoIB address
*
* @v ll_addr Link-layer address
* @ret string Link-layer address in human-readable format
*/
const char * ipoib_ntoa ( const void *ll_addr ) {
static char buf[45];
const struct ipoib_mac *mac = ll_addr;
snprintf ( buf, sizeof ( buf ), "%08lx:%08lx:%08lx:%08lx:%08lx",
htonl ( mac->qpn ), htonl ( mac->gid.u.dwords[0] ),
htonl ( mac->gid.u.dwords[1] ),
htonl ( mac->gid.u.dwords[2] ),
htonl ( mac->gid.u.dwords[3] ) );
return buf;
}
/** IPoIB protocol */
struct ll_protocol ipoib_protocol __ll_protocol = {
.name = "IPoIB",
.ll_proto = htons ( ARPHRD_INFINIBAND ),
.ll_addr_len = IPOIB_ALEN,
.ll_header_len = IPOIB_HLEN,
.ll_broadcast = ( uint8_t * ) &ipoib_broadcast,
.tx = ipoib_tx,
.rx = ipoib_rx,
.ntoa = ipoib_ntoa,
};
/****************************************************************************
*
* IPoIB network device
*
****************************************************************************
*/
/**
* Destroy queue set
*
* @v ipoib IPoIB device
* @v qset Queue set
*/
static void ipoib_destroy_qset ( struct ipoib_device *ipoib,
struct ipoib_queue_set *qset ) {
struct ib_device *ibdev = ipoib->ibdev;
if ( qset->qp )
ib_destroy_qp ( ibdev, qset->qp );
if ( qset->cq )
ib_destroy_cq ( ibdev, qset->cq );
memset ( qset, 0, sizeof ( *qset ) );
}
/**
* Create queue set
*
* @v ipoib IPoIB device
* @v qset Queue set
* @ret rc Return status code
*/
static int ipoib_create_qset ( struct ipoib_device *ipoib,
struct ipoib_queue_set *qset,
unsigned int num_cqes,
unsigned int num_send_wqes,
unsigned int num_recv_wqes,
unsigned long qkey ) {
struct ib_device *ibdev = ipoib->ibdev;
int rc;
/* Store queue parameters */
qset->recv_max_fill = num_recv_wqes;
/* Allocate completion queue */
qset->cq = ib_create_cq ( ibdev, num_cqes );
if ( ! qset->cq ) {
DBGC ( ipoib, "IPoIB %p could not allocate completion queue\n",
ipoib );
rc = -ENOMEM;
goto err;
}
/* Allocate queue pair */
qset->qp = ib_create_qp ( ibdev, num_send_wqes, qset->cq,
num_recv_wqes, qset->cq, qkey );
if ( ! qset->qp ) {
DBGC ( ipoib, "IPoIB %p could not allocate queue pair\n",
ipoib );
rc = -ENOMEM;
goto err;
}
qset->qp->owner_priv = ipoib->netdev;
return 0;
err:
ipoib_destroy_qset ( ipoib, qset );
return rc;
}
/**
* Find path cache entry by GID
*
* @v gid GID
* @ret entry Path cache entry, or NULL
*/
static struct ipoib_cached_path *
ipoib_find_cached_path ( struct ib_gid *gid ) {
struct ipoib_cached_path *path;
unsigned int i;
for ( i = 0 ; i < IPOIB_NUM_CACHED_PATHS ; i++ ) {
path = &ipoib_path_cache[i];
if ( memcmp ( &path->gid, gid, sizeof ( *gid ) ) == 0 )
return path;
}
DBG ( "IPoIB %08lx:%08lx:%08lx:%08lx cache miss\n",
htonl ( gid->u.dwords[0] ), htonl ( gid->u.dwords[1] ),
htonl ( gid->u.dwords[2] ), htonl ( gid->u.dwords[3] ) );
return NULL;
}
/**
* Transmit path record request
*
* @v ipoib IPoIB device
* @v gid Destination GID
* @ret rc Return status code
*/
static int ipoib_get_path_record ( struct ipoib_device *ipoib,
struct ib_gid *gid ) {
struct ib_device *ibdev = ipoib->ibdev;
struct io_buffer *iobuf;
struct ib_mad_path_record *path_record;
struct ib_address_vector av;
int rc;
/* Allocate I/O buffer */
iobuf = alloc_iob ( sizeof ( *path_record ) );
if ( ! iobuf )
return -ENOMEM;
iob_put ( iobuf, sizeof ( *path_record ) );
path_record = iobuf->data;
memset ( path_record, 0, sizeof ( *path_record ) );
/* Construct path record request */
path_record->mad_hdr.base_version = IB_MGMT_BASE_VERSION;
path_record->mad_hdr.mgmt_class = IB_MGMT_CLASS_SUBN_ADM;
path_record->mad_hdr.class_version = 2;
path_record->mad_hdr.method = IB_MGMT_METHOD_GET;
path_record->mad_hdr.attr_id = htons ( IB_SA_ATTR_PATH_REC );
path_record->mad_hdr.tid[0] = IPOIB_TID_GET_PATH_REC;
path_record->mad_hdr.tid[1] = ipoib_meta_tid++;
path_record->sa_hdr.comp_mask[1] =
htonl ( IB_SA_PATH_REC_DGID | IB_SA_PATH_REC_SGID );
memcpy ( &path_record->dgid, gid, sizeof ( path_record->dgid ) );
memcpy ( &path_record->sgid, &ibdev->port_gid,
sizeof ( path_record->sgid ) );
/* Construct address vector */
memset ( &av, 0, sizeof ( av ) );
av.dlid = ibdev->sm_lid;
av.dest_qp = IB_SA_QPN;
av.qkey = IB_GLOBAL_QKEY;
/* Post send request */
if ( ( rc = ib_post_send ( ibdev, ipoib->meta.qp, &av,
iobuf ) ) != 0 ) {
DBGC ( ipoib, "IPoIB %p could not send get path record: %s\n",
ipoib, strerror ( rc ) );
free_iob ( iobuf );
return rc;
}
return 0;
}
/**
* Transmit multicast group membership request
*
* @v ipoib IPoIB device
* @v gid Multicast GID
* @v join Join (rather than leave) group
* @ret rc Return status code
*/
static int ipoib_mc_member_record ( struct ipoib_device *ipoib,
struct ib_gid *gid, int join ) {
struct ib_device *ibdev = ipoib->ibdev;
struct io_buffer *iobuf;
struct ib_mad_mc_member_record *mc_member_record;
struct ib_address_vector av;
int rc;
/* Allocate I/O buffer */
iobuf = alloc_iob ( sizeof ( *mc_member_record ) );
if ( ! iobuf )
return -ENOMEM;
iob_put ( iobuf, sizeof ( *mc_member_record ) );
mc_member_record = iobuf->data;
memset ( mc_member_record, 0, sizeof ( *mc_member_record ) );
/* Construct multicast membership record request */
mc_member_record->mad_hdr.base_version = IB_MGMT_BASE_VERSION;
mc_member_record->mad_hdr.mgmt_class = IB_MGMT_CLASS_SUBN_ADM;
mc_member_record->mad_hdr.class_version = 2;
mc_member_record->mad_hdr.method =
( join ? IB_MGMT_METHOD_SET : IB_MGMT_METHOD_DELETE );
mc_member_record->mad_hdr.attr_id = htons ( IB_SA_ATTR_MC_MEMBER_REC );
mc_member_record->mad_hdr.tid[0] = IPOIB_TID_MC_MEMBER_REC;
mc_member_record->mad_hdr.tid[1] = ipoib_meta_tid++;
mc_member_record->sa_hdr.comp_mask[1] =
htonl ( IB_SA_MCMEMBER_REC_MGID | IB_SA_MCMEMBER_REC_PORT_GID |
IB_SA_MCMEMBER_REC_JOIN_STATE );
mc_member_record->scope__join_state = 1;
memcpy ( &mc_member_record->mgid, gid,
sizeof ( mc_member_record->mgid ) );
memcpy ( &mc_member_record->port_gid, &ibdev->port_gid,
sizeof ( mc_member_record->port_gid ) );
/* Construct address vector */
memset ( &av, 0, sizeof ( av ) );
av.dlid = ibdev->sm_lid;
av.dest_qp = IB_SA_QPN;
av.qkey = IB_GLOBAL_QKEY;
/* Post send request */
if ( ( rc = ib_post_send ( ibdev, ipoib->meta.qp, &av,
iobuf ) ) != 0 ) {
DBGC ( ipoib, "IPoIB %p could not send get path record: %s\n",
ipoib, strerror ( rc ) );
free_iob ( iobuf );
return rc;
}
return 0;
}
/**
* Transmit packet via IPoIB network device
*
* @v netdev Network device
* @v iobuf I/O buffer
* @ret rc Return status code
*/
static int ipoib_transmit ( struct net_device *netdev,
struct io_buffer *iobuf ) {
struct ipoib_device *ipoib = netdev->priv;
struct ib_device *ibdev = ipoib->ibdev;
struct ipoib_pseudo_hdr *ipoib_pshdr = iobuf->data;
struct ib_address_vector av;
struct ib_gid *gid;
struct ipoib_cached_path *path;
int rc;
/* Sanity check */
if ( iob_len ( iobuf ) < sizeof ( *ipoib_pshdr ) ) {
DBGC ( ipoib, "IPoIB %p buffer too short\n", ipoib );
return -EINVAL;
}
iob_pull ( iobuf, ( sizeof ( *ipoib_pshdr ) ) );
/* Construct address vector */
memset ( &av, 0, sizeof ( av ) );
av.qkey = IB_GLOBAL_QKEY;
av.gid_present = 1;
if ( ipoib_pshdr->peer.qpn == htonl ( IPOIB_BROADCAST_QPN ) ) {
/* Broadcast address */
av.dest_qp = IB_BROADCAST_QPN;
av.dlid = ipoib->broadcast_lid;
gid = &ipoib->broadcast_gid;
} else {
/* Unicast - look in path cache */
path = ipoib_find_cached_path ( &ipoib_pshdr->peer.gid );
if ( ! path ) {
/* No path entry - get path record */
rc = ipoib_get_path_record ( ipoib,
&ipoib_pshdr->peer.gid );
netdev_tx_complete ( netdev, iobuf );
return rc;
}
av.dest_qp = ntohl ( ipoib_pshdr->peer.qpn );
av.dlid = path->dlid;
av.rate = path->rate;
av.sl = path->sl;
gid = &ipoib_pshdr->peer.gid;
}
memcpy ( &av.gid, gid, sizeof ( av.gid ) );
return ib_post_send ( ibdev, ipoib->data.qp, &av, iobuf );
}
/**
* Handle IPoIB data send completion
*
* @v ibdev Infiniband device
* @v qp Queue pair
* @v completion Completion
* @v iobuf I/O buffer
*/
static void ipoib_data_complete_send ( struct ib_device *ibdev __unused,
struct ib_queue_pair *qp,
struct ib_completion *completion,
struct io_buffer *iobuf ) {
struct net_device *netdev = qp->owner_priv;
netdev_tx_complete_err ( netdev, iobuf,
( completion->syndrome ? -EIO : 0 ) );
}
/**
* Handle IPoIB data receive completion
*
* @v ibdev Infiniband device
* @v qp Queue pair
* @v completion Completion
* @v iobuf I/O buffer
*/
static void ipoib_data_complete_recv ( struct ib_device *ibdev __unused,
struct ib_queue_pair *qp,
struct ib_completion *completion,
struct io_buffer *iobuf ) {
struct net_device *netdev = qp->owner_priv;
struct ipoib_device *ipoib = netdev->priv;
struct ipoib_pseudo_hdr *ipoib_pshdr;
if ( completion->syndrome ) {
netdev_rx_err ( netdev, iobuf, -EIO );
goto done;
}
iob_put ( iobuf, completion->len );
if ( iob_len ( iobuf ) < sizeof ( struct ib_global_route_header ) ) {
DBGC ( ipoib, "IPoIB %p received data packet too short to "
"contain GRH\n", ipoib );
DBGC_HD ( ipoib, iobuf->data, iob_len ( iobuf ) );
netdev_rx_err ( netdev, iobuf, -EIO );
goto done;
}
iob_pull ( iobuf, sizeof ( struct ib_global_route_header ) );
if ( iob_len ( iobuf ) < sizeof ( struct ipoib_real_hdr ) ) {
DBGC ( ipoib, "IPoIB %p received data packet too short to "
"contain IPoIB header\n", ipoib );
DBGC_HD ( ipoib, iobuf->data, iob_len ( iobuf ) );
netdev_rx_err ( netdev, iobuf, -EIO );
goto done;
}
ipoib_pshdr = iob_push ( iobuf, sizeof ( *ipoib_pshdr ) );
/* FIXME: fill in a MAC address for the sake of AoE! */
netdev_rx ( netdev, iobuf );
done:
ipoib->data.recv_fill--;
}
/**
* Handle IPoIB metadata send completion
*
* @v ibdev Infiniband device
* @v qp Queue pair
* @v completion Completion
* @v iobuf I/O buffer
*/
static void ipoib_meta_complete_send ( struct ib_device *ibdev __unused,
struct ib_queue_pair *qp,
struct ib_completion *completion,
struct io_buffer *iobuf ) {
struct net_device *netdev = qp->owner_priv;
struct ipoib_device *ipoib = netdev->priv;
if ( completion->syndrome ) {
DBGC ( ipoib, "IPoIB %p metadata TX completion error %x\n",
ipoib, completion->syndrome );
}
free_iob ( iobuf );
}
/**
* Handle received IPoIB path record
*
* @v ipoib IPoIB device
* @v path_record Path record
*/
static void ipoib_recv_path_record ( struct ipoib_device *ipoib __unused,
struct ib_mad_path_record *path_record ) {
struct ipoib_cached_path *path;
/* Update path cache entry */
path = &ipoib_path_cache[ipoib_path_cache_idx];
memcpy ( &path->gid, &path_record->dgid, sizeof ( path->gid ) );
path->dlid = ntohs ( path_record->dlid );
path->sl = ( path_record->reserved__sl & 0x0f );
path->rate = ( path_record->rate_selector__rate & 0x3f );
DBG ( "IPoIB %08lx:%08lx:%08lx:%08lx dlid %x sl %x rate %x\n",
htonl ( path->gid.u.dwords[0] ), htonl ( path->gid.u.dwords[1] ),
htonl ( path->gid.u.dwords[2] ), htonl ( path->gid.u.dwords[3] ),
path->dlid, path->sl, path->rate );
/* Update path cache index */
ipoib_path_cache_idx++;
if ( ipoib_path_cache_idx == IPOIB_NUM_CACHED_PATHS )
ipoib_path_cache_idx = 0;
}
/**
* Handle received IPoIB multicast membership record
*
* @v ipoib IPoIB device
* @v mc_member_record Multicast membership record
*/
static void ipoib_recv_mc_member_record ( struct ipoib_device *ipoib,
struct ib_mad_mc_member_record *mc_member_record ) {
/* Record parameters */
ipoib->broadcast_joined =
( mc_member_record->scope__join_state & 0x0f );
ipoib->data_qkey = ntohl ( mc_member_record->qkey );
ipoib->broadcast_lid = ntohs ( mc_member_record->mlid );
DBGC ( ipoib, "IPoIB %p %s broadcast group: qkey %lx mlid %x\n",
ipoib, ( ipoib->broadcast_joined ? "joined" : "left" ),
ipoib->data_qkey, ipoib->broadcast_lid );
}
/**
* Handle IPoIB metadata receive completion
*
* @v ibdev Infiniband device
* @v qp Queue pair
* @v completion Completion
* @v iobuf I/O buffer
*/
static void ipoib_meta_complete_recv ( struct ib_device *ibdev __unused,
struct ib_queue_pair *qp,
struct ib_completion *completion,
struct io_buffer *iobuf ) {
struct net_device *netdev = qp->owner_priv;
struct ipoib_device *ipoib = netdev->priv;
union ib_mad *mad;
if ( completion->syndrome ) {
DBGC ( ipoib, "IPoIB %p metadata RX completion error %x\n",
ipoib, completion->syndrome );
goto done;
}
iob_put ( iobuf, completion->len );
if ( iob_len ( iobuf ) < sizeof ( struct ib_global_route_header ) ) {
DBGC ( ipoib, "IPoIB %p received metadata packet too short "
"to contain GRH\n", ipoib );
DBGC_HD ( ipoib, iobuf->data, iob_len ( iobuf ) );
goto done;
}
iob_pull ( iobuf, sizeof ( struct ib_global_route_header ) );
if ( iob_len ( iobuf ) < sizeof ( *mad ) ) {
DBGC ( ipoib, "IPoIB %p received metadata packet too short "
"to contain reply\n", ipoib );
DBGC_HD ( ipoib, iobuf->data, iob_len ( iobuf ) );
goto done;
}
mad = iobuf->data;
if ( mad->mad_hdr.status != 0 ) {
DBGC ( ipoib, "IPoIB %p metadata RX err status %04x\n",
ipoib, ntohs ( mad->mad_hdr.status ) );
goto done;
}
switch ( mad->mad_hdr.tid[0] ) {
case IPOIB_TID_GET_PATH_REC:
ipoib_recv_path_record ( ipoib, &mad->path_record );
break;
case IPOIB_TID_MC_MEMBER_REC:
ipoib_recv_mc_member_record ( ipoib, &mad->mc_member_record );
break;
default:
DBGC ( ipoib, "IPoIB %p unwanted response:\n",
ipoib );
DBGC_HD ( ipoib, mad, sizeof ( *mad ) );
break;
}
done:
ipoib->meta.recv_fill--;
free_iob ( iobuf );
}
/**
* Refill IPoIB receive ring
*
* @v ipoib IPoIB device
* @v qset Queue set
*/
static void ipoib_refill_recv ( struct ipoib_device *ipoib,
struct ipoib_queue_set *qset ) {
struct ib_device *ibdev = ipoib->ibdev;
struct io_buffer *iobuf;
int rc;
while ( qset->recv_fill < qset->recv_max_fill ) {
iobuf = alloc_iob ( IPOIB_MTU );
if ( ! iobuf )
break;
if ( ( rc = ib_post_recv ( ibdev, qset->qp, iobuf ) ) != 0 ) {
free_iob ( iobuf );
break;
}
qset->recv_fill++;
}
}
/**
* Poll IPoIB network device
*
* @v netdev Network device
*/
static void ipoib_poll ( struct net_device *netdev ) {
struct ipoib_device *ipoib = netdev->priv;
struct ib_device *ibdev = ipoib->ibdev;
ib_poll_cq ( ibdev, ipoib->meta.cq, ipoib_meta_complete_send,
ipoib_meta_complete_recv );
ib_poll_cq ( ibdev, ipoib->data.cq, ipoib_data_complete_send,
ipoib_data_complete_recv );
ipoib_refill_recv ( ipoib, &ipoib->meta );
ipoib_refill_recv ( ipoib, &ipoib->data );
}
/**
* Enable/disable interrupts on IPoIB network device
*
* @v netdev Network device
* @v enable Interrupts should be enabled
*/
static void ipoib_irq ( struct net_device *netdev __unused,
int enable __unused ) {
/* No implementation */
}
/**
* Open IPoIB network device
*
* @v netdev Network device
* @ret rc Return status code
*/
static int ipoib_open ( struct net_device *netdev ) {
struct ipoib_device *ipoib = netdev->priv;
struct ib_device *ibdev = ipoib->ibdev;
int rc;
/* Attach to broadcast multicast GID */
if ( ( rc = ib_mcast_attach ( ibdev, ipoib->data.qp,
&ipoib->broadcast_gid ) ) != 0 ) {
DBG ( "Could not attach to broadcast GID: %s\n",
strerror ( rc ) );
return rc;
}
/* Fill receive rings */
ipoib_refill_recv ( ipoib, &ipoib->meta );
ipoib_refill_recv ( ipoib, &ipoib->data );
return 0;
}
/**
* Close IPoIB network device
*
* @v netdev Network device
*/
static void ipoib_close ( struct net_device *netdev ) {
struct ipoib_device *ipoib = netdev->priv;
struct ib_device *ibdev = ipoib->ibdev;
/* Detach from broadcast multicast GID */
ib_mcast_detach ( ibdev, ipoib->data.qp, &ipoib->broadcast_gid );
/* FIXME: should probably flush the receive ring */
}
/** IPoIB network device operations */
static struct net_device_operations ipoib_operations = {
.open = ipoib_open,
.close = ipoib_close,
.transmit = ipoib_transmit,
.poll = ipoib_poll,
.irq = ipoib_irq,
};
/**
* Join IPoIB broadcast group
*
* @v ipoib IPoIB device
* @ret rc Return status code
*/
static int ipoib_join_broadcast_group ( struct ipoib_device *ipoib ) {
struct ib_device *ibdev = ipoib->ibdev;
unsigned int delay_ms;
int rc;
/* Make sure we have some receive descriptors */
ipoib_refill_recv ( ipoib, &ipoib->meta );
/* Send join request */
if ( ( rc = ipoib_mc_member_record ( ipoib, &ipoib->broadcast_gid,
1 ) ) != 0 ) {
DBGC ( ipoib, "IPoIB %p could not send broadcast join: %s\n",
ipoib, strerror ( rc ) );
return rc;
}
/* Wait for join to complete. Ideally we wouldn't delay for
* this long, but we need the queue key before we can set up
* the data queue pair, which we need before we can know the
* MAC address.
*/
for ( delay_ms = IPOIB_JOIN_MAX_DELAY_MS ; delay_ms ; delay_ms-- ) {
mdelay ( 1 );
ib_poll_cq ( ibdev, ipoib->meta.cq, ipoib_meta_complete_send,
ipoib_meta_complete_recv );
ipoib_refill_recv ( ipoib, &ipoib->meta );
if ( ipoib->broadcast_joined )
return 0;
}
DBGC ( ipoib, "IPoIB %p timed out waiting for broadcast join\n",
ipoib );
return -ETIMEDOUT;
}
/**
* Probe IPoIB device
*
* @v ibdev Infiniband device
* @ret rc Return status code
*/
int ipoib_probe ( struct ib_device *ibdev ) {
struct net_device *netdev;
struct ipoib_device *ipoib;
struct ipoib_mac *mac;
int rc;
/* Allocate network device */
netdev = alloc_ipoibdev ( sizeof ( *ipoib ) );
if ( ! netdev )
return -ENOMEM;
netdev_init ( netdev, &ipoib_operations );
ipoib = netdev->priv;
ib_set_ownerdata ( ibdev, netdev );
netdev->dev = ibdev->dev;
memset ( ipoib, 0, sizeof ( *ipoib ) );
ipoib->netdev = netdev;
ipoib->ibdev = ibdev;
/* Calculate broadcast GID */
memcpy ( &ipoib->broadcast_gid, &ipv4_broadcast_gid,
sizeof ( ipoib->broadcast_gid ) );
ipoib->broadcast_gid.u.words[2] = htons ( ibdev->pkey );
/* Allocate metadata queue set */
if ( ( rc = ipoib_create_qset ( ipoib, &ipoib->meta,
IPOIB_META_NUM_CQES,
IPOIB_META_NUM_SEND_WQES,
IPOIB_META_NUM_RECV_WQES,
IB_GLOBAL_QKEY ) ) != 0 ) {
DBGC ( ipoib, "IPoIB %p could not allocate metadata QP: %s\n",
ipoib, strerror ( rc ) );
goto err_create_meta_qset;
}
/* Join broadcast group */
if ( ( rc = ipoib_join_broadcast_group ( ipoib ) ) != 0 ) {
DBGC ( ipoib, "IPoIB %p could not join broadcast group: %s\n",
ipoib, strerror ( rc ) );
goto err_join_broadcast_group;
}
/* Allocate data queue set */
if ( ( rc = ipoib_create_qset ( ipoib, &ipoib->data,
IPOIB_DATA_NUM_CQES,
IPOIB_DATA_NUM_SEND_WQES,
IPOIB_DATA_NUM_RECV_WQES,
ipoib->data_qkey ) ) != 0 ) {
DBGC ( ipoib, "IPoIB %p could not allocate data QP: %s\n",
ipoib, strerror ( rc ) );
goto err_create_data_qset;
}
/* Construct MAC address */
mac = ( ( struct ipoib_mac * ) netdev->ll_addr );
mac->qpn = htonl ( ipoib->data.qp->qpn );
memcpy ( &mac->gid, &ibdev->port_gid, sizeof ( mac->gid ) );
/* Register network device */
if ( ( rc = register_netdev ( netdev ) ) != 0 )
goto err_register_netdev;
return 0;
err_register_netdev:
ipoib_destroy_qset ( ipoib, &ipoib->data );
err_join_broadcast_group:
err_create_data_qset:
ipoib_destroy_qset ( ipoib, &ipoib->meta );
err_create_meta_qset:
netdev_nullify ( netdev );
netdev_put ( netdev );
return rc;
}
/**
* Remove IPoIB device
*
* @v ibdev Infiniband device
*/
void ipoib_remove ( struct ib_device *ibdev ) {
struct net_device *netdev = ib_get_ownerdata ( ibdev );
struct ipoib_device *ipoib = netdev->priv;
unregister_netdev ( netdev );
ipoib_destroy_qset ( ipoib, &ipoib->data );
ipoib_destroy_qset ( ipoib, &ipoib->meta );
netdev_nullify ( netdev );
netdev_put ( netdev );
}