emacs 正则表达式问题

如何解决emacs 正则表达式问题

下面的代码没有得到我期望的结果（末尾的 `_数字` 后缀没有被去掉），不知道正则表达式应该怎么写：

;; Return the part of STRG captured by group 1 of the pattern below,
;; i.e. the rest of the line after the first "u_" or "U_".
;;
;; NOTE(review): group 1 is a greedy `.*' and the pattern has no end
;; anchor, so the optional "_<digits>" group always matches the empty
;; string and a trailing numeric suffix stays inside group 1 (see the
;; sample results listed after this function).
;; NOTE(review): the `\]' inside the string presumably reads as a plain
;; `]', making the character class `[0-9]' -- confirm.
(defun padname (strg)
  ;; Sets the match data consumed by `match-string'; the result of
  ;; `string-match' (match position or nil) is not checked here.
  (string-match "[uU]_\\(.*\\)\\(_[0-9\]+\\)?" strg)
    ;; Value of the function: the text matched by group 1.
    (match-string 1 strg)
)

(padname "u_CLR_REQ_SUP_00")
"CLR_REQ_SUP_00" ==> expect "CLR_REQ_SUP"
(padname "u_CLR_REQ_SUP_0")
"CLR_REQ_SUP_0"  ==> expect "CLR_REQ_SUP"
(padname "u_PTO_AVDD_3P3_0")
"PTO_AVDD_3P3_0"  ==> expect "PTO_AVDD_3P3"
(padname "u_PTO_0")
"PTO_0"  ==> expect "PTO"
(padname "u_PTO")
"PTO" ==> as expected
(padname "u_BTNI")
"BTNI" ==> as expected

解决方法

您可以将第一组改为非贪婪匹配（`.*?`），并在第二个可选组之后添加一个行尾锚点 `$`，例如：`[uU]_\\(.*?\\)\\(_[0-9]+\\)?$`

from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import col

import pandas as pd

# Create (or reuse) the SparkSession used by the rest of this script.
spark = (
    SparkSession.builder
    .appName("Python Spark SQL basic example")
    .config("spark.some.config.option", "some-value")
    .getOrCreate()
)

# Parquet file to load.
filename = 'D:/Simple.parquet'

# Scenario selector: keep exactly one of the assignments below active.
issue_num = 0  # Workaround to issues (equivalent to no issue)
# issue_num = 1  # Issue 1 - Unable to show dataframe or select column with name containing invalid character(s)
# issue_num = 2  # Issue 2 - Unable to show dataframe or select column after rename (using withColumnRenamed)
# issue_num = 3  # Issue 3 - Unable to show dataframe or select column after rename (using toDF)
# issue_num = 4  # Issue 4 - Unable to extract rdd from renamed dataframe
# issue_num = 5  # Issue 5 - Unable to select column with alias

# Run the scenario selected by `issue_num`.
#   0      -> workaround: load the parquet file through pandas, then build a
#             second dataframe by hand from an explicit schema.
#   1 - 5  -> read the parquet file with spark.read and run the statement(s)
#             that the comments in each branch report as failing.
#   other  -> only read the file and print its schema.
if issue_num == 0:

    ################################################################################################
    # WORKAROUND - Create Spark data frame from Pandas dataframe
    df_pd = pd.read_parquet(filename)
    DF = spark.createDataFrame(df_pd)
    print('WORKAROUND')
    DF.show()
    # +-----------------------------------+-----------------------------------+
    # |SpeedReference_Final_01 (RifVel_G0)|SpeedReference_Final_02 (RifVel_G1)|
    # +-----------------------------------+-----------------------------------+
    # |                  553.5228271484375|                     720.3720703125|
    # |                  553.5228271484375|                     720.3720703125|
    # |                  553.5228271484375|                     720.3720703125|
    # |                  553.5228271484375|                     720.3720703125|
    # |                  553.5228271484375|                     720.3720703125|
    # +-----------------------------------+-----------------------------------+

    ################################################################################################
    # Correct management of columns with invalid characters when using spark.createDataFrame
    # spark.createDataFrame: Create a dataframe with two columns with invalid characters - OK
    # DFCREATED
    schema = StructType(
        [
            StructField("SpeedReference_Final_01 (RifVel_G0)", FloatType(), nullable=True),
            # StructField requires a dataType; it was missing for this column,
            # which made the constructor raise TypeError. Both columns are floats.
            StructField("SpeedReference_Final_02 (RifVel_G1)", FloatType(), nullable=True),
        ]
    )

    row_in = [(553.523, 720.372), (553.523, 720.372)]

    rdd = spark.sparkContext.parallelize(row_in)
    DFCREATED = spark.createDataFrame(rdd, schema)
    DFCREATED.show()
    # +-----------------------------------+-----------------------------------+
    # |SpeedReference_Final_01 (RifVel_G0)|SpeedReference_Final_02 (RifVel_G1)|
    # +-----------------------------------+-----------------------------------+
    # |                            553.523|                            720.372|
    # |                            553.523|                            720.372|
    # +-----------------------------------+-----------------------------------+
    DF_SEL_VAR_CREATED = DFCREATED.select(DFCREATED.columns[0]).take(2)
    for el in DF_SEL_VAR_CREATED:
        print(el)
    # Row(SpeedReference_Final_01 (RifVel_G0)=553.5230102539062)
    # Row(SpeedReference_Final_01 (RifVel_G0)=553.5230102539062)

else:
    # spark.read: read file into dataframe - OK
    DF = spark.read.parquet(filename)
    print('ORIGINAL SCHEMA')
    DF.printSchema()
    # root
    #  |-- SpeedReference_Final_01 (RifVel_G0): float (nullable = true)
    #  |-- SpeedReference_Final_02 (RifVel_G1): float (nullable = true)

    if issue_num == 1:
        ###############################################################################################
        # Issue 1 - Unable to show dataframe or select column with name containing invalid character(s)
        DF.show()
        # DF.select(DF.columns[0]).show()
        # DF_SEL_VAR = DF.select(DF.columns[0]).take(3)
        # ECC: Attribute name "SpeedReference_Final_01 (RifVel_G0)" contains invalid character(s) among ",;{}()\n\t=". Please use alias to rename it.
        # on all 3 previous statements

    elif issue_num == 2:
        ###############################################################################################
        # Issue 2 - Unable to show dataframe or select column after rename (using withColumnRenamed)
        DFRENAMED = (
            DF.withColumnRenamed('SpeedReference_Final_01 (RifVel_G0)', 'RifVelG0')
            .withColumnRenamed('SpeedReference_Final_02 (RifVel_G1)', 'RifVelG1')
        )

        print('RENAMED SCHEMA')
        DFRENAMED.printSchema()
        # root
        #  |-- RifVelG0: float (nullable = true)
        #  |-- RifVelG1: float (nullable = true)

        DFRENAMED.show()
        # DF_SEL_VAR_RENAMED = DFRENAMED.select(DFRENAMED.RifVelG0).take(2)
        # ECC: Attribute name "SpeedReference_Final_01 (RifVel_G0)" contains invalid character(s) among ",;{}()\n\t=". Please use alias to rename it.
        # on all 2 previous statements

    elif issue_num == 3:
        ###############################################################################################
        # Issue 3 - Unable to show dataframe or select column after rename (using toDF)
        DFRENAMED = DF.toDF('RifVelG0', 'RifVelG1')

        print('RENAMED SCHEMA')
        DFRENAMED.printSchema()
        # root
        #  |-- RifVelG0: float (nullable = true)
        #  |-- RifVelG1: float (nullable = true)

        DFRENAMED.show()
        # DF_SEL_VAR_RENAMED = DFRENAMED.select(DFRENAMED.RifVelG0).take(2)
        # ECC: Attribute name "SpeedReference_Final_01 (RifVel_G0)" contains invalid character(s) among ",;{}()\n\t=". Please use alias to rename it.
        # on all 2 previous statements

    elif issue_num == 4:
        ###############################################################################################
        # Issue 4 - Unable to extract rdd from renamed dataframe
        DFRENAMED = DF.withColumnRenamed('SpeedReference_Final_01 (RifVel_G0)', 'RifVelG1')
        # The result is unused on purpose: accessing .rdd is the statement under test.
        DFRENAMED_rdd = DFRENAMED.rdd
        # ECC: Attribute name "SpeedReference_Final_01 (RifVel_G0)" contains invalid character(s) among ",;{}()\n\t=". Please use alias to rename it.

    elif issue_num == 5:
        ###############################################################################################
        # Issue 5 - Unable to select column with alias
        # The result is unused on purpose: the select/alias/take call is the statement under test.
        DF_SEL_VAR = DF.select(col(DF.columns[0]).alias('RifVelG0')).take(3)
        # ECC: Attribute name "SpeedReference_Final_01 (RifVel_G0)" contains invalid character(s) among ",;{}()\n\t=". Please use alias to rename it.

另一种写法：保留贪婪的 `.*`，但在其后加上 `[^0-9_][0-9]*`，使第一组在最后一个既不是数字也不是下划线的字符（以及紧随其后的数字）处结束，然后再接可选的第二组和行尾锚点：

[Uu]_\\(.*[^0-9_][0-9]*\\)\\(_[0-9]+\\)?$

见regex proof。

说明

--------------------------------------------------------------------------------
  [Uu]                     any character of: 'U','u'
--------------------------------------------------------------------------------
  _                        '_'
--------------------------------------------------------------------------------
  (                        group and capture to \1:
--------------------------------------------------------------------------------
    .*                       any character except \n (0 or more times
                             (matching the most amount possible))
--------------------------------------------------------------------------------
    [^0-9_]                  any character except: '0' to '9','_'
--------------------------------------------------------------------------------
    [0-9]*                   any character of: '0' to '9' (0 or more
                             times (matching the most amount
                             possible))
--------------------------------------------------------------------------------
  )                        end of \1
--------------------------------------------------------------------------------
  (                        group and capture to \2 (optional
                           (matching the most amount possible)):
--------------------------------------------------------------------------------
    _                        '_'
--------------------------------------------------------------------------------
    [0-9]+                   any character of: '0' to '9' (1 or more
                             times (matching the most amount
                             possible))
--------------------------------------------------------------------------------
  )?                       end of \2 (NOTE: because you are using a
                           quantifier on this capture,only the LAST
                           repetition of the captured pattern will be
                           stored in \2)
--------------------------------------------------------------------------------
  $                        before an optional \n,and the end of the
                           string