|
1 2 3 4 5 6 7 8 9 10 11 12 |
import re
text = 'cn_000858'

# Extract every run of consecutive digits; findall returns a list of strings.
numbers = re.findall(r'\d+', text)
print(numbers)  # ['000858']

# If only the first match is needed, check that the list is non-empty first
# so an input without digits does not raise IndexError.
if numbers:
    result = numbers[0]
    print(result)  # '000858'
|
1 2 3 4 5 6 7 8 9 |
import re
text = 'cn_000858'

# Search for the first run of digits. re.search returns None when nothing
# matches, so the result is checked before calling .group().
match = re.search(r'\d+', text)
if match:
    result = match.group()
    print(result)  # '000858'
|
1 2 3 4 5 6 7 8 9 |
import re
text = 'cn_000858'

# Match the digits that directly follow the literal prefix 'cn_'.
match = re.search(r'cn_(\d+)', text)
if match:
    result = match.group(1)  # group(1) is the first capture group
    print(result)  # '000858'
|
1 2 3 4 5 6 7 8 9 10 11 12 13 14 |
import re
# A string that contains more than one run of digits.
text = 'cn_000858_stock_123'

# Extract all digit runs.
numbers = re.findall(r'\d+', text)
print(numbers)  # ['000858', '123']

# Extract the digits at a specific position (right after 'cn_').
match = re.search(r'cn_(\d+)', text)
if match:
    stock_code = match.group(1)
    print(f"股票代码: {stock_code}")  # '000858'
|
1 2 3 4 5 6 7 8 9 10 11 12 |
import re
text = 'cn_000858'

# Split on underscores and take the last piece.
parts = text.split('_')
result = parts[-1]
print(result)  # '000858'

# Or split on runs of non-digit characters with a regex.
# NOTE: if the text ends with a non-digit, the last element is ''.
result = re.split(r'[^\d]+', text)[-1]
print(result)  # '000858'
|
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 |
import re
text = 'cn_000858'

# Extract the digits and convert them to an integer (drops leading zeros).
match = re.search(r'\d+', text)
if match:
    number_str = match.group()
    number_int = int(number_str)
    print(f"字符串: {number_str}, 整数: {number_int}")
    # Output: 字符串: 000858, 整数: 858

# To keep the leading zeros, stay with the string form. The match is checked
# again here so that a text without digits does not raise AttributeError.
if match:
    result = match.group()
    print(f"保留前导零: {result}")  # '000858'
|
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 |
import re
def extract_stock_code(text):
    """Extract a stock code from several common text formats.

    The patterns are tried in order and the first one that matches wins.

    Args:
        text: Input string such as 'cn_000858', 'sh600000', 'sz000001'
            or a bare six-digit code.

    Returns:
        The captured digit string, or None when ``text`` is empty/None or
        no pattern matches.
    """
    # Guard against None/'' so re.search does not raise TypeError.
    if not text:
        return None

    # Ordered from most specific to least specific.
    patterns = (
        r'cn_(\d+)',        # cn_000858
        r'(?:sh|sz)(\d+)',  # sh600000, sz000001
        r'^(\d{6})$',       # exactly six digits and nothing else
        r'_(\d{6})',        # six digits right after an underscore
    )

    for pattern in patterns:
        match = re.search(pattern, text, re.IGNORECASE)
        if match:
            return match.group(1)

    return None
# Tests
test_cases = [
    'cn_000858',
    'sh600000',
    'sz000001',
    '000858',
    'stock_cn_000858_data',
]

for text in test_cases:
    code = extract_stock_code(text)
    # Interpolate the extracted code; the placeholder must use braces.
    print(f"{text:20} -> {code}")
|
1 2 3 4 5 6 7 8 9 10 11 12 |
import re
text = 'cn_000858'

# Use a named capture group so the result can be read by name.
match = re.search(r'cn_(?P<code>\d+)', text)
if match:
    code = match.group('code')
    print(f"股票代码: {code}")  # '000858'

    # groupdict() returns every named group as a dict. It is kept inside the
    # guard because match is None when the pattern does not match.
    print(match.groupdict())  # {'code': '000858'}
|
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 |
import re
# Process several strings in one go.
texts = [
    'cn_000858',
    'cn_000001',
    'sh600000',
    'sz000002',
]

# List comprehension: search each text once and keep only real matches.
matches = (re.search(r'\d+', text) for text in texts)
codes = [m.group() for m in matches if m]
print(codes)  # ['000858', '000001', '600000', '000002']

# The same thing as an explicit loop; texts without digits are skipped.
codes = []
for text in texts:
    match = re.search(r'\d+', text)
    if match:
        codes.append(match.group())
print(codes)
|
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 |
import re
from typing import Optional, List


class StockCodeExtractor:
    """Helpers for pulling stock codes out of free-form text."""

    @staticmethod
    def extract(text: str) -> Optional[str]:
        """Extract a single stock code from ``text``.

        Patterns are tried in priority order; the first usable match is
        returned. Supported shapes include cn_000858, sh600000, sz000001
        and a bare 000858.

        Args:
            text: The string to search.

        Returns:
            The matched digit string, or None when ``text`` is empty or
            contains no digits. The last-resort pattern may return a digit
            run that is not six characters long (e.g. '00700' for 'hk00700').
        """
        if not text:
            return None

        # (pattern, group index), ordered by priority.
        patterns = [
            (r'cn_(\d{6})', 1),            # cn_000858
            (r'(?:sh|sz|hk)(\d{6})', 1),   # sh600000, sz000001
            (r'^(\d{6})$', 1),             # exactly six digits
            (r'_(\d{6})', 1),              # six digits after an underscore
            (r'\b(\d{6})\b', 1),           # six digits on word boundaries
            (r'\d+', 0),                   # any digit run (last resort)
        ]

        for pattern, group_idx in patterns:
            match = re.search(pattern, text, re.IGNORECASE)
            if match:
                code = match.group(group_idx)
                # Prefer a proper six-digit code.
                if len(code) == 6 and code.isdigit():
                    return code
                # The whole-match fallback is returned even if it is not
                # six digits long, because nothing better was found.
                elif group_idx == 0:
                    return code

        return None

    @staticmethod
    def extract_all(text: str) -> List[str]:
        """Return every standalone six-digit run found in ``text``.

        Digit lookarounds are used instead of ``\\b`` because letters and
        '_' are word characters: ``\\b`` never matches between 'cn_' or 'sh'
        and the digits, so prefixed codes would be missed entirely.

        Args:
            text: The string to search.

        Returns:
            A list of six-digit strings in order of appearance; empty when
            ``text`` is empty or has no such run. Runs longer than six
            digits are not split into codes.
        """
        if not text:
            return []
        return re.findall(r'(?<!\d)\d{6}(?!\d)', text)

    @staticmethod
    def validate(code: str) -> bool:
        """Return True when ``code`` is exactly six digits.

        ``re.fullmatch`` is used so a trailing newline is rejected (``$``
        with ``re.match`` would accept '000858\\n'). Non-string input is
        treated as invalid instead of raising TypeError.
        """
        if not isinstance(code, str):
            return False
        return re.fullmatch(r'\d{6}', code) is not None
# Usage example
if __name__ == "__main__":
    extractor = StockCodeExtractor()

    test_cases = [
        'cn_000858',
        '深南电路 cn_000858',
        'sh600000',
        'sz000001',
        '000858',
        '股票代码: 000858',
        'hk00700',
    ]

    print("单个提取:")
    for text in test_cases:
        code = extractor.extract(text)
        is_valid = extractor.validate(code) if code else False
        # str() keeps the width spec working when extract() returns None;
        # formatting None with ':10' raises TypeError.
        print(f"{text:25} -> {str(code):10} (有效: {is_valid})")

    print("\n批量提取:")
    text = "关注股票: cn_000858, sh600000, sz000001"
    codes = extractor.extract_all(text)
    print(f"文本: {text}")
    print(f"提取的代码: {codes}")
|
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 |
import re
import json


def extract_stock_code_from_redis(data_str):
    """Extract a stock code from a value read out of Redis.

    The value is first parsed as JSON. Depending on what it decodes to:

    * dict   - look at well-known code fields, then at every value;
    * str    - search the decoded string for six digits;
    * other  - (number, list, ...) search the raw text for six digits;
    * not JSON - search the raw text for six digits.

    Args:
        data_str: The raw value as ``str`` or ``bytes``.

    Returns:
        The digit string that was found, or None when nothing usable is
        present (including None/empty/non-text input).
    """
    # Accept bytes as well as str; the regexes below need text.
    if isinstance(data_str, (bytes, bytearray)):
        data_str = bytes(data_str).decode('utf-8', errors='replace')
    if not isinstance(data_str, str) or not data_str:
        return None

    try:
        data = json.loads(data_str)
    except json.JSONDecodeError:
        # Not JSON (e.g. 'cn_000858' or '000858'): search the raw text.
        match = re.search(r'\d{6}', data_str)
        return match.group() if match else None

    if isinstance(data, dict):
        # Try the common field names first.
        code_fields = ['code', 'stock_code', 'symbol', 'ts_code']
        for field in code_fields:
            if field in data:
                match = re.search(r'\d+', str(data[field]))
                if match:
                    return match.group()

        # Otherwise accept the first value whose leading digit run is
        # exactly six digits long.
        for value in data.values():
            match = re.search(r'\d+', str(value))
            if match and len(match.group()) == 6:
                return match.group()

        return None

    if isinstance(data, str):
        match = re.search(r'\d{6}', data)
        return match.group() if match else None

    # JSON scalars/arrays such as '600000' decode to non-string objects;
    # fall back to the raw text so a bare numeric code is not lost.
    match = re.search(r'\d{6}', data_str)
    return match.group() if match else None
# Tests
test_data = [
    '{"name": "深南电路", "code": "cn_000858"}',
    '{"ts_code": "000858.SZ", "name": "深南电路"}',
    'cn_000858',
    '000858',
]

for data in test_data:
    code = extract_stock_code_from_redis(data)
    # Interpolate the extracted code; the placeholder must use braces.
    print(f"{data:40} -> 代码: {code}")
针对您的具体需求,最简单的代码:
|
1 2 3 4 5 6 7 8 9 10 |
import re
# Extract the digits from 'cn_000858'.
# NOTE: both forms below assume the text contains digits; without a match
# the first raises AttributeError and the second raises IndexError.
text = 'cn_000858'
result = re.search(r'\d+', text).group()
print(result)  # '000858'

# Or as a one-liner.
result = re.findall(r'\d+', 'cn_000858')[0]
print(result)  # '000858'
| 模式 | 说明 | 示例 |
|---|---|---|
| \d+ | 匹配一个或多个数字 | ‘000858’ |
| \d{6} | 匹配恰好6个数字 | ‘000858’ |
| cn_(\d+) | 匹配'cn_'后的数字 | ‘000858’ |
| (?:sh\|sz)(\d+) | 匹配sh或sz后的数字 | ‘600000’ |
| \b\d{6}\b | 单词边界的6位数字 | ‘000858’ |
选择最适合您场景的方法即可!